
[3/5] kvm: x86: Defer setting of CR2 until #PF delivery

Message ID 20181008182945.79957-3-jmattson@google.com (mailing list archive)
State New, archived
Series [1/5] kvm: x86: Add payload to kvm_queued_exception and kvm_vcpu_events

Commit Message

Jim Mattson Oct. 8, 2018, 6:29 p.m. UTC
When exception payloads are enabled by userspace (which is not yet
possible) and a #PF is raised in L2, defer the setting of CR2 until
the #PF is delivered. This allows the L1 hypervisor to intercept the
fault before CR2 is modified.

For backwards compatibility, when exception payloads are not enabled
by userspace, kvm_multiple_exception modifies CR2 when the #PF
exception is raised.

Reported-by: Jim Mattson <jmattson@google.com>
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Jim Mattson <jmattson@google.com>
Reviewed-by: Peter Shier <pshier@google.com>
---
 arch/x86/kvm/svm.c | 13 ++++++-------
 arch/x86/kvm/vmx.c | 19 +++++++++----------
 arch/x86/kvm/x86.c | 40 ++++++++++++++++++++++++++++++++++++----
 arch/x86/kvm/x86.h |  2 ++
 4 files changed, 53 insertions(+), 21 deletions(-)

Comments

Liran Alon Oct. 9, 2018, 11:57 a.m. UTC | #1
> On 8 Oct 2018, at 21:29, Jim Mattson <jmattson@google.com> wrote:
> 
> When exception payloads are enabled by userspace (which is not yet
> possible) and a #PF is raised in L2, defer the setting of CR2 until
> the #PF is delivered. This allows the L1 hypervisor to intercept the
> fault before CR2 is modified.
> 
> For backwards compatibility, when exception payloads are not enabled
> by userspace, kvm_multiple_exception modifies CR2 when the #PF
> exception is raised.
> 
> Reported-by: Jim Mattson <jmattson@google.com>
> Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Jim Mattson <jmattson@google.com>
> Reviewed-by: Peter Shier <pshier@google.com>
> ---
> arch/x86/kvm/svm.c | 13 ++++++-------
> arch/x86/kvm/vmx.c | 19 +++++++++----------
> arch/x86/kvm/x86.c | 40 ++++++++++++++++++++++++++++++++++++----
> arch/x86/kvm/x86.h |  2 ++
> 4 files changed, 53 insertions(+), 21 deletions(-)
> 
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index d96092b35936..8b05a84cf8f2 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -805,6 +805,8 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
> 	    nested_svm_check_exception(svm, nr, has_error_code, error_code))
> 		return;
> 
> +	kvm_deliver_exception_payload(&svm->vcpu);
> +
> 	if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
> 		unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
> 
> @@ -2965,16 +2967,13 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
> 	svm->vmcb->control.exit_info_1 = error_code;
> 
> 	/*
> -	 * FIXME: we should not write CR2 when L1 intercepts an L2 #PF exception.
> -	 * The fix is to add the ancillary datum (CR2 or DR6) to structs
> -	 * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6 can be
> -	 * written only when inject_pending_event runs (DR6 would written here
> -	 * too).  This should be conditional on a new capability---if the
> -	 * capability is disabled, kvm_multiple_exception would write the
> -	 * ancillary information to CR2 or DR6, for backwards ABI-compatibility.
> +	 * EXITINFO2 is undefined for all exception intercepts other
> +	 * than #PF.
> 	 */
> 	if (svm->vcpu.arch.exception.nested_apf)
> 		svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
> +	else if (svm->vcpu.arch.exception.has_payload)
> +		svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
> 	else
> 		svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
> 
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 06412ba46aa3..fc3f2d27b580 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -3275,27 +3275,24 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
> {
> 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> 	unsigned int nr = vcpu->arch.exception.nr;
> +	bool has_payload = vcpu->arch.exception.has_payload;
> +	unsigned long payload = vcpu->arch.exception.payload;
> 
> 	if (nr == PF_VECTOR) {
> 		if (vcpu->arch.exception.nested_apf) {
> 			*exit_qual = vcpu->arch.apf.nested_apf_token;
> 			return 1;
> 		}
> -		/*
> -		 * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception.
> -		 * The fix is to add the ancillary datum (CR2 or DR6) to structs
> -		 * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6
> -		 * can be written only when inject_pending_event runs.  This should be
> -		 * conditional on a new capability---if the capability is disabled,
> -		 * kvm_multiple_exception would write the ancillary information to
> -		 * CR2 or DR6, for backwards ABI-compatibility.
> -		 */
> 		if (nested_vmx_is_page_fault_vmexit(vmcs12,
> 						    vcpu->arch.exception.error_code)) {
> -			*exit_qual = vcpu->arch.cr2;
> +			*exit_qual = has_payload ? payload : vcpu->arch.cr2;
> 			return 1;
> 		}
> 	} else {
> +		/*
> +		 * FIXME: we must not write DR6 when L1 intercepts an
> +		 * L2 #DB exception.
> +		 */
> 		if (vmcs12->exception_bitmap & (1u << nr)) {
> 			if (nr == DB_VECTOR)
> 				*exit_qual = vcpu->arch.dr6;
> @@ -3329,6 +3326,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
> 	u32 error_code = vcpu->arch.exception.error_code;
> 	u32 intr_info = nr | INTR_INFO_VALID_MASK;
> 
> +	kvm_deliver_exception_payload(vcpu);
> +
> 	if (has_error_code) {
> 		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
> 		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 7c4e6845fe80..974f0784ac99 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -400,6 +400,26 @@ static int exception_type(int vector)
> 	return EXCPT_FAULT;
> }
> 
> +void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
> +{
> +	unsigned nr = vcpu->arch.exception.nr;
> +	unsigned long has_payload = vcpu->arch.exception.has_payload;

has_payload should be defined as “bool”.
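
For example (just a sketch, mirroring the bool local already used in
nested_vmx_check_exception() above):

	bool has_payload = vcpu->arch.exception.has_payload;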

> +	unsigned long payload = vcpu->arch.exception.payload;
> +
> +	if (!has_payload)
> +		return;
> +
> +	switch (nr) {
> +	case PF_VECTOR:
> +		vcpu->arch.cr2 = payload;
> +		break;
> +	}
> +
> +	vcpu->arch.exception.has_payload = false;
> +	vcpu->arch.exception.payload = 0;
> +}
> +EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
> +
> static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
> 		unsigned nr, bool has_error, u32 error_code,
> 	        bool has_payload, unsigned long payload, bool reinject)
> @@ -433,6 +453,9 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
> 		vcpu->arch.exception.error_code = error_code;
> 		vcpu->arch.exception.has_payload = has_payload;
> 		vcpu->arch.exception.payload = payload;
> +		if (!vcpu->kvm->arch.exception_payload_enabled ||
> +		    !is_guest_mode(vcpu))
> +			kvm_deliver_exception_payload(vcpu);

I would add a comment in the code here explaining why we defer delivery of the exception payload when in guest mode.
I don’t think this should only be described in the relevant commit messages.
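
Something along these lines would do (just a suggested wording, based on the
reasoning in the commit message, not a requirement):

		/*
		 * When userspace has enabled exception payloads and the
		 * vCPU is in guest mode, defer delivery of the payload
		 * (e.g. the CR2 write for a #PF) until the exception is
		 * actually injected, so that L1 can intercept the fault
		 * before CR2 is modified. Otherwise, deliver the payload
		 * immediately for backwards compatibility.
		 */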

> 		return;
> 	}
> 
> @@ -478,6 +501,13 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
> }
> EXPORT_SYMBOL_GPL(kvm_requeue_exception);
> 
> +static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
> +				    u32 error_code, unsigned long payload)
> +{
> +	kvm_multiple_exception(vcpu, nr, true, error_code,
> +			       true, payload, false);
> +}
> +
> int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
> {
> 	if (err)
> @@ -494,11 +524,13 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
> 	++vcpu->stat.pf_guest;
> 	vcpu->arch.exception.nested_apf =
> 		is_guest_mode(vcpu) && fault->async_page_fault;
> -	if (vcpu->arch.exception.nested_apf)
> +	if (vcpu->arch.exception.nested_apf) {
> 		vcpu->arch.apf.nested_apf_token = fault->address;
> -	else
> -		vcpu->arch.cr2 = fault->address;
> -	kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
> +		kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
> +	} else {
> +		kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
> +					fault->address);
> +	}
> }
> EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
> 
> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> index 67b9568613f3..224cd0a47568 100644
> --- a/arch/x86/kvm/x86.h
> +++ b/arch/x86/kvm/x86.h
> @@ -266,6 +266,8 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu,
> 
> int handle_ud(struct kvm_vcpu *vcpu);
> 
> +void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu);
> +
> void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu);
> u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
> bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data);
> -- 
> 2.19.0.605.g01d371f741-goog
> 

Looks good to me.
Reviewed-by: Liran Alon <liran.alon@oracle.com>

Patch

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d96092b35936..8b05a84cf8f2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -805,6 +805,8 @@  static void svm_queue_exception(struct kvm_vcpu *vcpu)
 	    nested_svm_check_exception(svm, nr, has_error_code, error_code))
 		return;
 
+	kvm_deliver_exception_payload(&svm->vcpu);
+
 	if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
 		unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
 
@@ -2965,16 +2967,13 @@  static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 	svm->vmcb->control.exit_info_1 = error_code;
 
 	/*
-	 * FIXME: we should not write CR2 when L1 intercepts an L2 #PF exception.
-	 * The fix is to add the ancillary datum (CR2 or DR6) to structs
-	 * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6 can be
-	 * written only when inject_pending_event runs (DR6 would written here
-	 * too).  This should be conditional on a new capability---if the
-	 * capability is disabled, kvm_multiple_exception would write the
-	 * ancillary information to CR2 or DR6, for backwards ABI-compatibility.
+	 * EXITINFO2 is undefined for all exception intercepts other
+	 * than #PF.
 	 */
 	if (svm->vcpu.arch.exception.nested_apf)
 		svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
+	else if (svm->vcpu.arch.exception.has_payload)
+		svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
 	else
 		svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 06412ba46aa3..fc3f2d27b580 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3275,27 +3275,24 @@  static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
 {
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	unsigned int nr = vcpu->arch.exception.nr;
+	bool has_payload = vcpu->arch.exception.has_payload;
+	unsigned long payload = vcpu->arch.exception.payload;
 
 	if (nr == PF_VECTOR) {
 		if (vcpu->arch.exception.nested_apf) {
 			*exit_qual = vcpu->arch.apf.nested_apf_token;
 			return 1;
 		}
-		/*
-		 * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception.
-		 * The fix is to add the ancillary datum (CR2 or DR6) to structs
-		 * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6
-		 * can be written only when inject_pending_event runs.  This should be
-		 * conditional on a new capability---if the capability is disabled,
-		 * kvm_multiple_exception would write the ancillary information to
-		 * CR2 or DR6, for backwards ABI-compatibility.
-		 */
 		if (nested_vmx_is_page_fault_vmexit(vmcs12,
 						    vcpu->arch.exception.error_code)) {
-			*exit_qual = vcpu->arch.cr2;
+			*exit_qual = has_payload ? payload : vcpu->arch.cr2;
 			return 1;
 		}
 	} else {
+		/*
+		 * FIXME: we must not write DR6 when L1 intercepts an
+		 * L2 #DB exception.
+		 */
 		if (vmcs12->exception_bitmap & (1u << nr)) {
 			if (nr == DB_VECTOR)
 				*exit_qual = vcpu->arch.dr6;
@@ -3329,6 +3326,8 @@  static void vmx_queue_exception(struct kvm_vcpu *vcpu)
 	u32 error_code = vcpu->arch.exception.error_code;
 	u32 intr_info = nr | INTR_INFO_VALID_MASK;
 
+	kvm_deliver_exception_payload(vcpu);
+
 	if (has_error_code) {
 		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
 		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7c4e6845fe80..974f0784ac99 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -400,6 +400,26 @@  static int exception_type(int vector)
 	return EXCPT_FAULT;
 }
 
+void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
+{
+	unsigned nr = vcpu->arch.exception.nr;
+	unsigned long has_payload = vcpu->arch.exception.has_payload;
+	unsigned long payload = vcpu->arch.exception.payload;
+
+	if (!has_payload)
+		return;
+
+	switch (nr) {
+	case PF_VECTOR:
+		vcpu->arch.cr2 = payload;
+		break;
+	}
+
+	vcpu->arch.exception.has_payload = false;
+	vcpu->arch.exception.payload = 0;
+}
+EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
+
 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 		unsigned nr, bool has_error, u32 error_code,
 	        bool has_payload, unsigned long payload, bool reinject)
@@ -433,6 +453,9 @@  static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 		vcpu->arch.exception.error_code = error_code;
 		vcpu->arch.exception.has_payload = has_payload;
 		vcpu->arch.exception.payload = payload;
+		if (!vcpu->kvm->arch.exception_payload_enabled ||
+		    !is_guest_mode(vcpu))
+			kvm_deliver_exception_payload(vcpu);
 		return;
 	}
 
@@ -478,6 +501,13 @@  void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 }
 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
 
+static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
+				    u32 error_code, unsigned long payload)
+{
+	kvm_multiple_exception(vcpu, nr, true, error_code,
+			       true, payload, false);
+}
+
 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
 {
 	if (err)
@@ -494,11 +524,13 @@  void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 	++vcpu->stat.pf_guest;
 	vcpu->arch.exception.nested_apf =
 		is_guest_mode(vcpu) && fault->async_page_fault;
-	if (vcpu->arch.exception.nested_apf)
+	if (vcpu->arch.exception.nested_apf) {
 		vcpu->arch.apf.nested_apf_token = fault->address;
-	else
-		vcpu->arch.cr2 = fault->address;
-	kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
+		kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
+	} else {
+		kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
+					fault->address);
+	}
 }
 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
 
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 67b9568613f3..224cd0a47568 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -266,6 +266,8 @@  int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu,
 
 int handle_ud(struct kvm_vcpu *vcpu);
 
+void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu);
+
 void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu);
 u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
 bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data);