[1/2] KVM: VMX: fix instruction skipping when handling UD exception

Message ID 8ad4de9dae77ee3690ee9bd3c5a51d235d619eb6.1634870747.git.houwenlong93@linux.alibaba.com (mailing list archive)
State New, archived
Series KVM: some fixes about RDMSR/WRMSR instruction emulation

Commit Message

Hou Wenlong Oct. 22, 2021, 2:59 a.m. UTC
When kvm.force_emulation_prefix is enabled, an instruction with
the KVM prefix triggers a #UD exception and is then emulated.
The emulation may need to exit to userspace due to userspace
I/O, and the complete_userspace_io callback may skip the
instruction, e.g. MSR access emulation exits to userspace if
userspace wants to know about the MSR fault. However,
VM_EXIT_INSTRUCTION_LEN in the VMCS is invalid at that point,
so kvm_emulate_instruction() should be used to skip the
instruction.

Signed-off-by: Hou Wenlong <houwenlong93@linux.alibaba.com>
---
 arch/x86/kvm/vmx/vmx.c | 4 ++--
 arch/x86/kvm/vmx/vmx.h | 9 +++++++++
 2 files changed, 11 insertions(+), 2 deletions(-)
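To make the failure mode concrete, here is a minimal userspace model of
the two skip paths. This is not KVM code; the names and lengths are
illustrative only (7 bytes assumes the 5-byte forced-emulation prefix
"ud2; .ascii 'kvm'" plus a 2-byte WRMSR):

/*
 * Standalone model: after the userspace exit, the VMCS instruction-length
 * field does not describe the instruction being emulated (it is undefined
 * for a hardware #UD exit), so the VMX fast-skip advances RIP by junk.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t rip = 0x1000;

/* Stands in for skip_emulated_instruction() trusting the VMCS field. */
static void skip_via_vmcs(uint32_t vm_exit_instruction_len)
{
        rip += vm_exit_instruction_len;
}

/* Stands in for kvm_emulate_instruction(vcpu, EMULTYPE_SKIP). */
static void skip_via_emulator(uint32_t decoded_len)
{
        rip += decoded_len;
}

int main(void)
{
        skip_via_vmcs(0);               /* undefined field: could be anything */
        printf("VMCS skip:     rip = %#llx (garbage advance)\n",
               (unsigned long long)rip);

        rip = 0x1000;
        skip_via_emulator(7);           /* length decoded by the emulator */
        printf("emulator skip: rip = %#llx (correct)\n",
               (unsigned long long)rip);
        return 0;
}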

Comments

Sean Christopherson Oct. 26, 2021, 4:37 p.m. UTC | #1
On Fri, Oct 22, 2021, Hou Wenlong wrote:
> When kvm.force_emulation_prefix is enabled, an instruction with
> the KVM prefix triggers a #UD exception and is then emulated.
> The emulation may need to exit to userspace due to userspace
> I/O, and the complete_userspace_io callback may skip the
> instruction, e.g. MSR access emulation exits to userspace if
> userspace wants to know about the MSR fault. However,
> VM_EXIT_INSTRUCTION_LEN in the VMCS is invalid at that point,
> so kvm_emulate_instruction() should be used to skip the
> instruction.
> 
> Signed-off-by: Hou Wenlong <houwenlong93@linux.alibaba.com>
> ---
>  arch/x86/kvm/vmx/vmx.c | 4 ++--
>  arch/x86/kvm/vmx/vmx.h | 9 +++++++++
>  2 files changed, 11 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 1c8b2b6e7ed9..01049d65da26 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -1501,8 +1501,8 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
>  	 * (namely Hyper-V) don't set it due to it being undefined behavior,
>  	 * i.e. we end up advancing IP with some random value.
>  	 */
> -	if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
> -	    exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
> +	if (!is_ud_exit(vcpu) && (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||

This is incomplete and is just a workaround for the underlying bug.  The same
mess can occur if the emulator triggers an exit to userspace during "normal"
emulation, e.g. if unrestricted guest is disabled and the guest accesses an MSR
while in Big RM.  In that case, there's no #UD to key off of.

The correct way to fix this is to attach a different callback when the MSR access
comes from the emulator.  I'd say rename the existing complete_emulated_{rd,wr}msr()
callbacks to complete_fast_{rd,wr}msr() to match the port I/O nomenclature.

Something like this (which also has some opportunistic simplification of the
error handling in kvm_emulate_{rd,wr}msr()).

---
 arch/x86/kvm/x86.c | 82 +++++++++++++++++++++++++---------------------
 1 file changed, 45 insertions(+), 37 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ac83d873d65b..7ff5b8d58ca3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1814,18 +1814,44 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
 }
 EXPORT_SYMBOL_GPL(kvm_set_msr);

-static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
+static void __complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
 {
-	int err = vcpu->run->msr.error;
-	if (!err) {
+	if (!vcpu->run->msr.error) {
 		kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
 		kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
 	}
+}

-	return static_call(kvm_x86_complete_emulated_msr)(vcpu, err);
+static int complete_emulated_msr_access(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->run->msr.error) {
+		kvm_inject_gp(vcpu, 0);
+		return 1;
+	}
+
+	return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP);
+}
+
+static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
+{
+	__complete_emulated_rdmsr(vcpu);
+
+	return complete_emulated_msr_access(vcpu);
 }

 static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
+{
+	return complete_emulated_msr_access(vcpu);
+}
+
+static int complete_fast_rdmsr(struct kvm_vcpu *vcpu)
+{
+	__complete_emulated_rdmsr(vcpu);
+
+	return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error);
+}
+
+static int complete_fast_wrmsr(struct kvm_vcpu *vcpu)
 {
 	return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error);
 }
@@ -1864,18 +1890,6 @@ static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
 	return 1;
 }

-static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r)
-{
-	return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0,
-				   complete_emulated_rdmsr, r);
-}
-
-static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r)
-{
-	return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data,
-				   complete_emulated_wrmsr, r);
-}
-
 int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
 {
 	u32 ecx = kvm_rcx_read(vcpu);
@@ -1883,19 +1897,15 @@ int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
 	int r;

 	r = kvm_get_msr(vcpu, ecx, &data);
-
-	/* MSR read failed? See if we should ask user space */
-	if (r && kvm_get_msr_user_space(vcpu, ecx, r)) {
-		/* Bounce to user space */
-		return 0;
-	}
-
 	if (!r) {
 		trace_kvm_msr_read(ecx, data);

 		kvm_rax_write(vcpu, data & -1u);
 		kvm_rdx_write(vcpu, (data >> 32) & -1u);
 	} else {
+		if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_RDMSR, 0,
+				       complete_fast_rdmsr, r))
+			return 0;
 		trace_kvm_msr_read_ex(ecx);
 	}

@@ -1910,20 +1920,16 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
 	int r;

 	r = kvm_set_msr(vcpu, ecx, data);
-
-	/* MSR write failed? See if we should ask user space */
-	if (r && kvm_set_msr_user_space(vcpu, ecx, data, r))
-		/* Bounce to user space */
-		return 0;
-
-	/* Signal all other negative errors to userspace */
-	if (r < 0)
-		return r;
-
-	if (!r)
+	if (!r) {
 		trace_kvm_msr_write(ecx, data);
-	else
+	} else {
+		if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_WRMSR, data,
+				       complete_fast_wrmsr, r))
+			return 0;
+		if (r < 0)
+			return r;
 		trace_kvm_msr_write_ex(ecx, data);
+	}

 	return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
 }
@@ -7387,7 +7393,8 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,

 	r = kvm_get_msr(vcpu, msr_index, pdata);

-	if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) {
+	if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0,
+				    complete_emulated_rdmsr, r)) {
 		/* Bounce to user space */
 		return X86EMUL_IO_NEEDED;
 	}
@@ -7403,7 +7410,8 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,

 	r = kvm_set_msr(vcpu, msr_index, data);

-	if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) {
+	if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data,
+				    complete_emulated_wrmsr, r)) {
 		/* Bounce to user space */
 		return X86EMUL_IO_NEEDED;
 	}
--
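As a side note on __complete_emulated_rdmsr() above: the RAX/RDX writes
implement RDMSR's architectural EDX:EAX split of the 64-bit MSR value. A
standalone sketch (plain C, not KVM code) of the split and the
guest-visible recombination:

/* Demo of the EDX:EAX split done by __complete_emulated_rdmsr(). */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t msr_data = 0x123456789abcdef0ULL;

        uint32_t eax = (uint32_t)msr_data;              /* kvm_rax_write(..., (u32)data) */
        uint32_t edx = (uint32_t)(msr_data >> 32);      /* kvm_rdx_write(..., data >> 32) */

        /* What the guest reassembles after RDMSR: EDX:EAX. */
        uint64_t reassembled = ((uint64_t)edx << 32) | eax;

        printf("eax=%#x edx=%#x roundtrip=%#llx\n",
               eax, edx, (unsigned long long)reassembled);
        return 0;
}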
Hou Wenlong Oct. 27, 2021, 7 a.m. UTC | #2
On Tue, Oct 26, 2021 at 04:37:50PM +0000, Sean Christopherson wrote:
> On Fri, Oct 22, 2021, Hou Wenlong wrote:

...

> +static int complete_emulated_msr_access(struct kvm_vcpu *vcpu)
> +{
> +	if (vcpu->run->msr.error) {
> +		kvm_inject_gp(vcpu, 0);
> +		return 1;
> +	}
> +
> +	return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP);
This loses the single-step check after the instruction is skipped. It
seems we should skip the single-step check in x86_emulate_instruction()
too for MSR accesses. I don't understand the `writeback` variable in
x86_emulate_instruction(); does it mean emulation was interrupted and
some regs in the emulation context haven't been synced back to the vcpu
context? Then in this case, should I set it to false in Patch 2?
(although no regs have been changed)

> +}
Hou Wenlong Oct. 29, 2021, 10:57 a.m. UTC | #3
On Tue, Oct 26, 2021 at 04:37:50PM +0000, Sean Christopherson wrote:
> On Fri, Oct 22, 2021, Hou Wenlong wrote:

...

> +static int complete_emulated_msr_access(struct kvm_vcpu *vcpu)
> +{
> +	if (vcpu->run->msr.error) {
> +		kvm_inject_gp(vcpu, 0);
> +		return 1;
> +	}
> +
> +	return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP);
> +}
> +
> +static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
> +{
> +	__complete_emulated_rdmsr(vcpu);
> +
> +	return complete_emulated_msr_access(vcpu);
>  }
> 
>  static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
> +{
> +	return complete_emulated_msr_access(vcpu);
> +}
> +
> +static int complete_fast_rdmsr(struct kvm_vcpu *vcpu)
> +{
> +	__complete_emulated_rdmsr(vcpu);
> +
> +	return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error);
> +}
> +
> +static int complete_fast_wrmsr(struct kvm_vcpu *vcpu)
>  {
>  	return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error);
>  }
> @@ -1864,18 +1890,6 @@ static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
>  	return 1;
>  }
> 
> -static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r)
> -{
> -	return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0,
> -				   complete_emulated_rdmsr, r);
> -}
> -
> -static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r)
> -{
> -	return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data,
> -				   complete_emulated_wrmsr, r);
> -}
> -
>  int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
>  {
>  	u32 ecx = kvm_rcx_read(vcpu);
> @@ -1883,19 +1897,15 @@ int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
>  	int r;
> 
>  	r = kvm_get_msr(vcpu, ecx, &data);
> -
> -	/* MSR read failed? See if we should ask user space */
> -	if (r && kvm_get_msr_user_space(vcpu, ecx, r)) {
> -		/* Bounce to user space */
> -		return 0;
> -	}
> -
>  	if (!r) {
>  		trace_kvm_msr_read(ecx, data);
> 
>  		kvm_rax_write(vcpu, data & -1u);
>  		kvm_rdx_write(vcpu, (data >> 32) & -1u);
>  	} else {
> +		if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_RDMSR, 0,
> +				       complete_fast_rdmsr, r))
> +			return 0;
>  		trace_kvm_msr_read_ex(ecx);
>  	}
> 
> @@ -1910,20 +1920,16 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
>  	int r;
> 
>  	r = kvm_set_msr(vcpu, ecx, data);
> -
> -	/* MSR write failed? See if we should ask user space */
> -	if (r && kvm_set_msr_user_space(vcpu, ecx, data, r))
> -		/* Bounce to user space */
> -		return 0;
> -
> -	/* Signal all other negative errors to userspace */
> -	if (r < 0)
> -		return r;
> -
> -	if (!r)
> +	if (!r) {
>  		trace_kvm_msr_write(ecx, data);
> -	else
> +	} else {
> +		if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_WRMSR, data,
> +				       complete_fast_wrmsr, r))
> +			return 0;
> +		if (r < 0)
> +			return r;
>  		trace_kvm_msr_write_ex(ecx, data);
> +	}
> 
>  	return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
>  }
> @@ -7387,7 +7393,8 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
> 
>  	r = kvm_get_msr(vcpu, msr_index, pdata);
> 
> -	if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) {
> +	if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0,
> +				    complete_emulated_rdmsr, r)) {
>  		/* Bounce to user space */
>  		return X86EMUL_IO_NEEDED;
>  	}
> @@ -7403,7 +7410,8 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
> 
>  	r = kvm_set_msr(vcpu, msr_index, data);
> 
> -	if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) {
> +	if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data,
> +				    complete_emulated_wrmsr, r)) {
>  		/* Bounce to user space */
>  		return X86EMUL_IO_NEEDED;
>  	}
> --
Hi Sean,

The note in x86_emulate_instruction() for EMULTYPE_SKIP says that the
caller is responsible for updating the interruptibility state and
injecting single-step #DBs. And the vendor callbacks for
kvm_skip_emulated_instruction() also do some special things, e.g.
I found that an sev_es guest just skips the RIP update. So it may be
more appropriate to add a parameter to the skip_emulated_instruction()
callback, which forces the use of x86_skip_instruction() if the
instruction length is invalid.

Thanks
Sean Christopherson Nov. 1, 2021, 5:03 p.m. UTC | #4
On Fri, Oct 29, 2021, Hou Wenlong wrote:
> On Tue, Oct 26, 2021 at 04:37:50PM +0000, Sean Christopherson wrote:
> > On Fri, Oct 22, 2021, Hou Wenlong wrote:
> > +static int complete_emulated_msr_access(struct kvm_vcpu *vcpu)
> > +{
> > +	if (vcpu->run->msr.error) {
> > +		kvm_inject_gp(vcpu, 0);
> > +		return 1;
> > +	}
> > +
> > +	return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP);
> > +}

...

> The note in x86_emulate_instruction() for EMULTYPE_SKIP says that the
> caller is responsible for updating the interruptibility state and
> injecting single-step #DBs.

Urgh, yes.  And that note also very clearly states it's for use only by the vendor
callbacks for exactly that reason.

> And the vendor callbacks for kvm_skip_emulated_instruction() also do some
> special things,

Luckily, the emulator also does (almost) all those special things.

> e.g. I found that an sev_es guest just skips the RIP update.

Emulation is impossible with sev_es because KVM can't decode the guest code stream,
so that particular wrinkle is out of scope.

> So it may be more appropriate to add a parameter to the skip_emulated_instruction()
> callback, which forces the use of x86_skip_instruction() if the instruction
> length is invalid.

I really don't like the idea of routing this through kvm_skip_emulated_instruction();
anything originating from the emulator ideally would be handled within the emulator
when possible, especially since we know that KVM is going to end up in the emulator
anyways.

The best idea I can come up with is to add a new emulation type to pair with _SKIP
to handle completion of user exits.  In theory it should be a tiny code change to
add a branch inside the EMULTYPE_SKIP path.

On a related topic, I think EMULTYPE_SKIP fails to handle wrapping EIP when the
guest has a flat code segment.

So this:

From e3511669c40e4d074fb19f43256fc5da8634af14 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Mon, 1 Nov 2021 09:52:35 -0700
Subject: [PATCH] KVM: x86: Handle 32-bit wrap of EIP for EMULTYPE_SKIP with
 flat code seg

Truncate the new EIP to a 32-bit value when handling EMULTYPE_SKIP as the
decode phase does not truncate _eip.  Wrapping the 32-bit boundary is
legal if and only if CS is a flat code segment, but that check is
implicitly handled in the form of limit checks in the decode phase.

Opportunistically prepare for a future fix by storing the result of any
truncation in "eip" instead of "_eip".

Fixes: 1957aa63be53 ("KVM: VMX: Handle single-step #DB for EMULTYPE_SKIP on EPT misconfig")
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/x86.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ac83d873d65b..3d7fc5c21ceb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8124,7 +8124,12 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 	 * updating interruptibility state and injecting single-step #DBs.
 	 */
 	if (emulation_type & EMULTYPE_SKIP) {
-		kvm_rip_write(vcpu, ctxt->_eip);
+		if (ctxt->mode != X86EMUL_MODE_PROT64)
+			ctxt->eip = (u32)ctxt->_eip;
+		else
+			ctxt->eip = ctxt->_eip;
+
+		kvm_rip_write(vcpu, ctxt->eip);
 		if (ctxt->eflags & X86_EFLAGS_RF)
 			kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
 		return 1;
--
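A tiny userspace demo (not part of the patch) of the truncation this
adds: the emulator's limit checks already reject the wrap unless CS is
flat, so outside 64-bit mode the new EIP only needs the (u32) cast to
wrap correctly:

/* Demo of the (u32) truncation for non-64-bit modes. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* A 2-byte instruction ending exactly at the 4 GiB boundary. */
        uint64_t _eip = 0xfffffffeULL + 2;

        uint64_t eip64 = _eip;                  /* X86EMUL_MODE_PROT64: no wrap */
        uint32_t eip32 = (uint32_t)_eip;        /* other modes: wraps to 0 */

        printf("64-bit mode: %#llx, flat 32-bit CS: %#x\n",
               (unsigned long long)eip64, eip32);
        return 0;
}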


That patch would then be followed by the rework, with complete_emulated_msr_access() doing
"EMULTYPE_SKIP | EMULTYPE_COMPLETE_USER_EXIT" with this as the functional change
in the emulator:

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3d7fc5c21ceb..13d4758810d1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8118,10 +8118,12 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                return 1;
        }

+
        /*
-        * Note, EMULTYPE_SKIP is intended for use *only* by vendor callbacks
-        * for kvm_skip_emulated_instruction().  The caller is responsible for
-        * updating interruptibility state and injecting single-step #DBs.
+        * EMULTYPE_SKIP without EMULTYPE_COMPLETE_USER_EXIT is intended for
+        * use *only* by vendor callbacks for kvm_skip_emulated_instruction().
+        * The caller is responsible for updating interruptibility state and
+        * injecting single-step #DBs.
         */
        if (emulation_type & EMULTYPE_SKIP) {
                if (ctxt->mode != X86EMUL_MODE_PROT64)
@@ -8129,6 +8131,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                else
                        ctxt->eip = ctxt->_eip;

+               if (emulation_type & EMULTYPE_COMPLETE_USER_EXIT)
+                       goto writeback;
+
                kvm_rip_write(vcpu, ctxt->eip);
                if (ctxt->eflags & X86_EFLAGS_RF)
                        kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
@@ -8198,6 +8203,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
        else
                r = 1;

+writeback:
        if (writeback) {
                unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
                toggle_interruptibility(vcpu, ctxt->interruptibility);

Patch

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 1c8b2b6e7ed9..01049d65da26 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1501,8 +1501,8 @@  static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	 * (namely Hyper-V) don't set it due to it being undefined behavior,
 	 * i.e. we end up advancing IP with some random value.
 	 */
-	if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
-	    exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
+	if (!is_ud_exit(vcpu) && (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
+	    exit_reason.basic != EXIT_REASON_EPT_MISCONFIG)) {
 		instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
 
 		/*
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 592217fd7d92..e7a7f580acd1 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -481,6 +481,15 @@  static inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
 	return vmx->exit_intr_info;
 }
 
+static inline bool is_ud_exit(struct kvm_vcpu *vcpu)
+{
+	union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
+	u32 intr_info = vmx_get_intr_info(vcpu);
+
+	return exit_reason.basic == EXIT_REASON_EXCEPTION_NMI &&
+	       is_invalid_opcode(intr_info);
+}
+
 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags);
 void free_vmcs(struct vmcs *vmcs);
 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
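For reference, a standalone model (plain C, not kernel code) of what the
proposed is_ud_exit() tests: an exception/NMI VM-exit whose interruption
info encodes a valid hardware #UD. Per the SDM, the interruption info
keeps the vector in bits 7:0, the type in bits 10:8, and a valid bit in
bit 31; the constants below mirror that layout but are redefined locally:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EXIT_REASON_EXCEPTION_NMI       0
#define UD_VECTOR                       6
#define INTR_TYPE_HARD_EXCEPTION        (3u << 8)
#define INTR_INFO_VALID_MASK            (1u << 31)

static bool is_invalid_opcode(uint32_t intr_info)
{
        uint32_t mask = 0xffu | (7u << 8) | INTR_INFO_VALID_MASK;

        return (intr_info & mask) ==
               (UD_VECTOR | INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
}

static bool is_ud_exit(uint16_t exit_reason_basic, uint32_t intr_info)
{
        return exit_reason_basic == EXIT_REASON_EXCEPTION_NMI &&
               is_invalid_opcode(intr_info);
}

int main(void)
{
        uint32_t ud = UD_VECTOR | INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK;

        printf("#UD exit: %d\n", is_ud_exit(EXIT_REASON_EXCEPTION_NMI, ud));
        printf("EPT misconfig exit: %d\n", is_ud_exit(49, 0));
        return 0;
}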