diff mbox

KVM: x86: fix singlestepping over syscall

Message ID 20170622151040.16231-1-rkrcmar@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Radim Krčmář June 22, 2017, 3:10 p.m. UTC
From: Paolo Bonzini <pbonzini@redhat.com>

TF is handled a bit differently for syscall and sysret, compared
to the other instructions: TF is checked after the instruction completes,
so that the OS can disable #DB at a syscall by adding TF to FMASK.
When the sysret is executed the #DB is taken "as if" the syscall insn
just completed.

KVM emulates syscall so that it can trap 32-bit syscall on Intel processors.
Fix the behavior, otherwise you could get #DB on a user stack which is not
nice.  This does not affect Linux guests, as they use an IST or task gate
for #DB.

This fixes CVE-2017-7518.

Cc: stable@vger.kernel.org
Reported-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 The patch is included in the following pull request to Linus.

 arch/x86/include/asm/kvm_emulate.h |  1 +
 arch/x86/kvm/emulate.c             |  1 +
 arch/x86/kvm/x86.c                 | 62 ++++++++++++++++++++------------------
 3 files changed, 34 insertions(+), 30 deletions(-)

Comments

Wanpeng Li June 27, 2017, 3:41 a.m. UTC | #1
2017-06-22 23:10 GMT+08:00 Radim Krčmář <rkrcmar@redhat.com>:
> From: Paolo Bonzini <pbonzini@redhat.com>
>
> TF is handled a bit differently for syscall and sysret, compared
> to the other instructions: TF is checked after the instruction completes,
> so that the OS can disable #DB at a syscall by adding TF to FMASK.
> When the sysret is executed the #DB is taken "as if" the syscall insn
> just completed.
>
> KVM emulates syscall so that it can trap 32-bit syscall on Intel processors.

We have a discussion to not expose syscall/sysret to Intel 32-bit
guest two years ago. https://lkml.org/lkml/2015/11/19/225 The
syscall/sysret just makes sense against long mode instead of
compatibility/legacy mode of Intel CPU. We will get a #UD in 32-bit
guest, and syscall emulation is introduced by commit 66bb2ccd (KVM:
x86 emulator: add syscall emulation) to handle it. So why we still
expose syscall/sysret to Intel 32-bit guest?

> Fix the behavior, otherwise you could get #DB on a user stack which is not
> nice.  This does not affect Linux guests, as they use an IST or task gate
> for #DB.
>
> This fixes CVE-2017-7518.
>
> Cc: stable@vger.kernel.org
> Reported-by: Andy Lutomirski <luto@kernel.org>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
> ---
>  The patch is included in the following pull request to Linus.
>
>  arch/x86/include/asm/kvm_emulate.h |  1 +
>  arch/x86/kvm/emulate.c             |  1 +
>  arch/x86/kvm/x86.c                 | 62 ++++++++++++++++++++------------------
>  3 files changed, 34 insertions(+), 30 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
> index 055962615779..722d0e568863 100644
> --- a/arch/x86/include/asm/kvm_emulate.h
> +++ b/arch/x86/include/asm/kvm_emulate.h
> @@ -296,6 +296,7 @@ struct x86_emulate_ctxt {
>
>         bool perm_ok; /* do not check permissions if true */
>         bool ud;        /* inject an #UD if host doesn't support insn */
> +       bool tf;        /* TF value before instruction (after for syscall/sysret) */
>
>         bool have_exception;
>         struct x86_exception exception;
> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
> index 0816ab2e8adc..80890dee66ce 100644
> --- a/arch/x86/kvm/emulate.c
> +++ b/arch/x86/kvm/emulate.c
> @@ -2742,6 +2742,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
>                 ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
>         }
>
> +       ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
>         return X86EMUL_CONTINUE;
>  }
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 87d3cb901935..0e846f0cb83b 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -5313,6 +5313,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
>         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
>
>         ctxt->eflags = kvm_get_rflags(vcpu);
> +       ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
> +

I guess this is used for "the sysret is executed the #DB is taken "as
if" the syscall insn just completed", however, there is no sysret
emulation, so how the #DB is taken after the sysret?

Regards,
Wanpeng Li

>         ctxt->eip = kvm_rip_read(vcpu);
>         ctxt->mode = (!is_protmode(vcpu))               ? X86EMUL_MODE_REAL :
>                      (ctxt->eflags & X86_EFLAGS_VM)     ? X86EMUL_MODE_VM86 :
> @@ -5528,36 +5530,25 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
>         return dr6;
>  }
>
> -static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r)
> +static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r)
>  {
>         struct kvm_run *kvm_run = vcpu->run;
>
> -       /*
> -        * rflags is the old, "raw" value of the flags.  The new value has
> -        * not been saved yet.
> -        *
> -        * This is correct even for TF set by the guest, because "the
> -        * processor will not generate this exception after the instruction
> -        * that sets the TF flag".
> -        */
> -       if (unlikely(rflags & X86_EFLAGS_TF)) {
> -               if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
> -                       kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 |
> -                                                 DR6_RTM;
> -                       kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
> -                       kvm_run->debug.arch.exception = DB_VECTOR;
> -                       kvm_run->exit_reason = KVM_EXIT_DEBUG;
> -                       *r = EMULATE_USER_EXIT;
> -               } else {
> -                       /*
> -                        * "Certain debug exceptions may clear bit 0-3.  The
> -                        * remaining contents of the DR6 register are never
> -                        * cleared by the processor".
> -                        */
> -                       vcpu->arch.dr6 &= ~15;
> -                       vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
> -                       kvm_queue_exception(vcpu, DB_VECTOR);
> -               }
> +       if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
> +               kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
> +               kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
> +               kvm_run->debug.arch.exception = DB_VECTOR;
> +               kvm_run->exit_reason = KVM_EXIT_DEBUG;
> +               *r = EMULATE_USER_EXIT;
> +       } else {
> +               /*
> +                * "Certain debug exceptions may clear bit 0-3.  The
> +                * remaining contents of the DR6 register are never
> +                * cleared by the processor".
> +                */
> +               vcpu->arch.dr6 &= ~15;
> +               vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
> +               kvm_queue_exception(vcpu, DB_VECTOR);
>         }
>  }
>
> @@ -5567,7 +5558,17 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
>         int r = EMULATE_DONE;
>
>         kvm_x86_ops->skip_emulated_instruction(vcpu);
> -       kvm_vcpu_check_singlestep(vcpu, rflags, &r);
> +
> +       /*
> +        * rflags is the old, "raw" value of the flags.  The new value has
> +        * not been saved yet.
> +        *
> +        * This is correct even for TF set by the guest, because "the
> +        * processor will not generate this exception after the instruction
> +        * that sets the TF flag".
> +        */
> +       if (unlikely(rflags & X86_EFLAGS_TF))
> +               kvm_vcpu_do_singlestep(vcpu, &r);
>         return r == EMULATE_DONE;
>  }
>  EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
> @@ -5726,8 +5727,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
>                 toggle_interruptibility(vcpu, ctxt->interruptibility);
>                 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
>                 kvm_rip_write(vcpu, ctxt->eip);
> -               if (r == EMULATE_DONE)
> -                       kvm_vcpu_check_singlestep(vcpu, rflags, &r);
> +               if (r == EMULATE_DONE &&
> +                   (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
> +                       kvm_vcpu_do_singlestep(vcpu, &r);
>                 if (!ctxt->have_exception ||
>                     exception_type(ctxt->exception.vector) == EXCPT_TRAP)
>                         __kvm_set_rflags(vcpu, ctxt->eflags);
> --
> 2.13.1
>
Paolo Bonzini June 27, 2017, 8:20 a.m. UTC | #2
On 27/06/2017 05:41, Wanpeng Li wrote:
>> KVM emulates syscall so that it can trap 32-bit syscall on Intel processors.
>
> We have a discussion to not expose syscall/sysret to Intel 32-bit
> guest two years ago. https://lkml.org/lkml/2015/11/19/225 The
> syscall/sysret just makes sense against long mode instead of
> compatibility/legacy mode of Intel CPU. We will get a #UD in 32-bit
> guest, and syscall emulation is introduced by commit 66bb2ccd (KVM:
> x86 emulator: add syscall emulation) to handle it. So why we still
> expose syscall/sysret to Intel 32-bit guest?

Because you didn't post v2 of that patch, I guess. :)

>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 87d3cb901935..0e846f0cb83b 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -5313,6 +5313,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
>>         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
>>
>>         ctxt->eflags = kvm_get_rflags(vcpu);
>> +       ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
>> +
>
> I guess this is used for "the sysret is executed the #DB is taken "as
> if" the syscall insn just completed", however, there is no sysret
> emulation, so how the #DB is taken after the sysret?

No, it's used for instructions other than syscall and sysret:

> +               if (r == EMULATE_DONE &&
> +                   (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
> +                       kvm_vcpu_do_singlestep(vcpu, &r);

syscall (and sysret if it were emulated) overwrite ctxt->tf with the
value of TF at the end of the instruction.  Other instructions don't, so
that singlestep depends on EFLAGS.TF before the instruction is executed.

Paolo
Wanpeng Li June 27, 2017, 9:50 a.m. UTC | #3
2017-06-27 16:20 GMT+08:00 Paolo Bonzini <pbonzini@redhat.com>:
>
>
> On 27/06/2017 05:41, Wanpeng Li wrote:
>>> KVM emulates syscall so that it can trap 32-bit syscall on Intel processors.
>>
>> We have a discussion to not expose syscall/sysret to Intel 32-bit
>> guest two years ago. https://lkml.org/lkml/2015/11/19/225 The
>> syscall/sysret just makes sense against long mode instead of

s/long mode/64-bit mode

>> compatibility/legacy mode of Intel CPU. We will get a #UD in 32-bit
>> guest, and syscall emulation is introduced by commit 66bb2ccd (KVM:
>> x86 emulator: add syscall emulation) to handle it. So why we still
>> expose syscall/sysret to Intel 32-bit guest?
>
> Because you didn't post v2 of that patch, I guess. :)
>
>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>> index 87d3cb901935..0e846f0cb83b 100644
>>> --- a/arch/x86/kvm/x86.c
>>> +++ b/arch/x86/kvm/x86.c
>>> @@ -5313,6 +5313,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
>>>         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
>>>
>>>         ctxt->eflags = kvm_get_rflags(vcpu);
>>> +       ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
>>> +
>>
>> I guess this is used for "the sysret is executed the #DB is taken "as
>> if" the syscall insn just completed", however, there is no sysret
>> emulation, so how the #DB is taken after the sysret?
>
> No, it's used for instructions other than syscall and sysret:
>
>> +               if (r == EMULATE_DONE &&
>> +                   (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
>> +                       kvm_vcpu_do_singlestep(vcpu, &r);
>
> syscall (and sysret if it were emulated) overwrite ctxt->tf with the
> value of TF at the end of the instruction.  Other instructions don't, so
> that singlestep depends on EFLAGS.TF before the instruction is executed.

Why sysret is not emulated since SDM said that it can incur a #UD if
not in 64-bit mode?

Regards,
Wanpeng Li
Paolo Bonzini June 27, 2017, 10:09 a.m. UTC | #4
On 27/06/2017 11:50, Wanpeng Li wrote:
> 2017-06-27 16:20 GMT+08:00 Paolo Bonzini <pbonzini@redhat.com>:
>>
>>
>> On 27/06/2017 05:41, Wanpeng Li wrote:
>>>> KVM emulates syscall so that it can trap 32-bit syscall on Intel processors.
>>>
>>> We have a discussion to not expose syscall/sysret to Intel 32-bit
>>> guest two years ago. https://lkml.org/lkml/2015/11/19/225 The
>>> syscall/sysret just makes sense against long mode instead of
> 
> s/long mode/64-bit mode
> 
>>> compatibility/legacy mode of Intel CPU. We will get a #UD in 32-bit
>>> guest, and syscall emulation is introduced by commit 66bb2ccd (KVM:
>>> x86 emulator: add syscall emulation) to handle it. So why we still
>>> expose syscall/sysret to Intel 32-bit guest?
>>
>> Because you didn't post v2 of that patch, I guess. :)
>>
>>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>>> index 87d3cb901935..0e846f0cb83b 100644
>>>> --- a/arch/x86/kvm/x86.c
>>>> +++ b/arch/x86/kvm/x86.c
>>>> @@ -5313,6 +5313,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
>>>>         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
>>>>
>>>>         ctxt->eflags = kvm_get_rflags(vcpu);
>>>> +       ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
>>>> +
>>>
>>> I guess this is used for "the sysret is executed the #DB is taken "as
>>> if" the syscall insn just completed", however, there is no sysret
>>> emulation, so how the #DB is taken after the sysret?
>>
>> No, it's used for instructions other than syscall and sysret:
>>
>>> +               if (r == EMULATE_DONE &&
>>> +                   (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
>>> +                       kvm_vcpu_do_singlestep(vcpu, &r);
>>
>> syscall (and sysret if it were emulated) overwrite ctxt->tf with the
>> value of TF at the end of the instruction.  Other instructions don't, so
>> that singlestep depends on EFLAGS.TF before the instruction is executed.
> 
> Why sysret is not emulated since SDM said that it can incur a #UD if
> not in 64-bit mode?

"64-bit ring 0 to 32-bit ring 3" sysret ("sysretl") is supported by Intel:

	IF (operand size is 64-bit)
	THEN CS.Selector ← IA32_STAR[63:48]+16;
	ELSE CS.Selector ← IA32_STAR[63:48];
	FI;

If you want to add support for emulating sysret, in particular legacy
mode sysret, that would be okay.  You can extend the new testcase to run
in 32-bit mode, too.

Paolo
Wanpeng Li June 27, 2017, 11:58 a.m. UTC | #5
2017-06-27 18:09 GMT+08:00 Paolo Bonzini <pbonzini@redhat.com>:
>
>
> On 27/06/2017 11:50, Wanpeng Li wrote:
>> 2017-06-27 16:20 GMT+08:00 Paolo Bonzini <pbonzini@redhat.com>:
>>>
>>>
>>> On 27/06/2017 05:41, Wanpeng Li wrote:
>>>>> KVM emulates syscall so that it can trap 32-bit syscall on Intel processors.
>>>>
>>>> We have a discussion to not expose syscall/sysret to Intel 32-bit
>>>> guest two years ago. https://lkml.org/lkml/2015/11/19/225 The
>>>> syscall/sysret just makes sense against long mode instead of
>>
>> s/long mode/64-bit mode
>>
>>>> compatibility/legacy mode of Intel CPU. We will get a #UD in 32-bit
>>>> guest, and syscall emulation is introduced by commit 66bb2ccd (KVM:
>>>> x86 emulator: add syscall emulation) to handle it. So why we still
>>>> expose syscall/sysret to Intel 32-bit guest?
>>>
>>> Because you didn't post v2 of that patch, I guess. :)
>>>
>>>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>>>> index 87d3cb901935..0e846f0cb83b 100644
>>>>> --- a/arch/x86/kvm/x86.c
>>>>> +++ b/arch/x86/kvm/x86.c
>>>>> @@ -5313,6 +5313,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
>>>>>         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
>>>>>
>>>>>         ctxt->eflags = kvm_get_rflags(vcpu);
>>>>> +       ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
>>>>> +
>>>>
>>>> I guess this is used for "the sysret is executed the #DB is taken "as
>>>> if" the syscall insn just completed", however, there is no sysret
>>>> emulation, so how the #DB is taken after the sysret?
>>>
>>> No, it's used for instructions other than syscall and sysret:
>>>
>>>> +               if (r == EMULATE_DONE &&
>>>> +                   (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
>>>> +                       kvm_vcpu_do_singlestep(vcpu, &r);
>>>
>>> syscall (and sysret if it were emulated) overwrite ctxt->tf with the
>>> value of TF at the end of the instruction.  Other instructions don't, so
>>> that singlestep depends on EFLAGS.TF before the instruction is executed.
>>
>> Why sysret is not emulated since SDM said that it can incur a #UD if
>> not in 64-bit mode?
>
> "64-bit ring 0 to 32-bit ring 3" sysret ("sysretl") is supported by Intel:
>
>         IF (operand size is 64-bit)
>         THEN CS.Selector ← IA32_STAR[63:48]+16;
>         ELSE CS.Selector ← IA32_STAR[63:48];
>         FI;

I see. :)

Regards,
Wanpeng Li
diff mbox

Patch

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 055962615779..722d0e568863 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -296,6 +296,7 @@  struct x86_emulate_ctxt {
 
 	bool perm_ok; /* do not check permissions if true */
 	bool ud;	/* inject an #UD if host doesn't support insn */
+	bool tf;	/* TF value before instruction (after for syscall/sysret) */
 
 	bool have_exception;
 	struct x86_exception exception;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0816ab2e8adc..80890dee66ce 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2742,6 +2742,7 @@  static int em_syscall(struct x86_emulate_ctxt *ctxt)
 		ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
 	}
 
+	ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
 	return X86EMUL_CONTINUE;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 87d3cb901935..0e846f0cb83b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5313,6 +5313,8 @@  static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 
 	ctxt->eflags = kvm_get_rflags(vcpu);
+	ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
+
 	ctxt->eip = kvm_rip_read(vcpu);
 	ctxt->mode = (!is_protmode(vcpu))		? X86EMUL_MODE_REAL :
 		     (ctxt->eflags & X86_EFLAGS_VM)	? X86EMUL_MODE_VM86 :
@@ -5528,36 +5530,25 @@  static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
 	return dr6;
 }
 
-static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r)
+static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r)
 {
 	struct kvm_run *kvm_run = vcpu->run;
 
-	/*
-	 * rflags is the old, "raw" value of the flags.  The new value has
-	 * not been saved yet.
-	 *
-	 * This is correct even for TF set by the guest, because "the
-	 * processor will not generate this exception after the instruction
-	 * that sets the TF flag".
-	 */
-	if (unlikely(rflags & X86_EFLAGS_TF)) {
-		if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
-			kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 |
-						  DR6_RTM;
-			kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
-			kvm_run->debug.arch.exception = DB_VECTOR;
-			kvm_run->exit_reason = KVM_EXIT_DEBUG;
-			*r = EMULATE_USER_EXIT;
-		} else {
-			/*
-			 * "Certain debug exceptions may clear bit 0-3.  The
-			 * remaining contents of the DR6 register are never
-			 * cleared by the processor".
-			 */
-			vcpu->arch.dr6 &= ~15;
-			vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
-			kvm_queue_exception(vcpu, DB_VECTOR);
-		}
+	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+		kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
+		kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
+		kvm_run->debug.arch.exception = DB_VECTOR;
+		kvm_run->exit_reason = KVM_EXIT_DEBUG;
+		*r = EMULATE_USER_EXIT;
+	} else {
+		/*
+		 * "Certain debug exceptions may clear bit 0-3.  The
+		 * remaining contents of the DR6 register are never
+		 * cleared by the processor".
+		 */
+		vcpu->arch.dr6 &= ~15;
+		vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
+		kvm_queue_exception(vcpu, DB_VECTOR);
 	}
 }
 
@@ -5567,7 +5558,17 @@  int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	int r = EMULATE_DONE;
 
 	kvm_x86_ops->skip_emulated_instruction(vcpu);
-	kvm_vcpu_check_singlestep(vcpu, rflags, &r);
+
+	/*
+	 * rflags is the old, "raw" value of the flags.  The new value has
+	 * not been saved yet.
+	 *
+	 * This is correct even for TF set by the guest, because "the
+	 * processor will not generate this exception after the instruction
+	 * that sets the TF flag".
+	 */
+	if (unlikely(rflags & X86_EFLAGS_TF))
+		kvm_vcpu_do_singlestep(vcpu, &r);
 	return r == EMULATE_DONE;
 }
 EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
@@ -5726,8 +5727,9 @@  int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 		toggle_interruptibility(vcpu, ctxt->interruptibility);
 		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
 		kvm_rip_write(vcpu, ctxt->eip);
-		if (r == EMULATE_DONE)
-			kvm_vcpu_check_singlestep(vcpu, rflags, &r);
+		if (r == EMULATE_DONE &&
+		    (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
+			kvm_vcpu_do_singlestep(vcpu, &r);
 		if (!ctxt->have_exception ||
 		    exception_type(ctxt->exception.vector) == EXCPT_TRAP)
 			__kvm_set_rflags(vcpu, ctxt->eflags);