Message ID | 20170622151040.16231-1-rkrcmar@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
2017-06-22 23:10 GMT+08:00 Radim Krčmář <rkrcmar@redhat.com>: > From: Paolo Bonzini <pbonzini@redhat.com> > > TF is handled a bit differently for syscall and sysret, compared > to the other instructions: TF is checked after the instruction completes, > so that the OS can disable #DB at a syscall by adding TF to FMASK. > When the sysret is executed the #DB is taken "as if" the syscall insn > just completed. > > KVM emulates syscall so that it can trap 32-bit syscall on Intel processors. We have a discussion to not expose syscall/sysret to Intel 32-bit guest two years ago. https://lkml.org/lkml/2015/11/19/225 The syscall/sysret just makes sense against long mode instead of compatibility/legacy mode of Intel CPU. We will get a #UD in 32-bit guest, and syscall emulation is introduced by commit 66bb2ccd (KVM: x86 emulator: add syscall emulation) to handle it. So why we still expose syscall/sysret to Intel 32-bit guest? > Fix the behavior, otherwise you could get #DB on a user stack which is not > nice. This does not affect Linux guests, as they use an IST or task gate > for #DB. > > This fixes CVE-2017-7518. > > Cc: stable@vger.kernel.org > Reported-by: Andy Lutomirski <luto@kernel.org> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> > Signed-off-by: Radim Krčmář <rkrcmar@redhat.com> > --- > The patch is included in the following pull request to Linus. > > arch/x86/include/asm/kvm_emulate.h | 1 + > arch/x86/kvm/emulate.c | 1 + > arch/x86/kvm/x86.c | 62 ++++++++++++++++++++------------------ > 3 files changed, 34 insertions(+), 30 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h > index 055962615779..722d0e568863 100644 > --- a/arch/x86/include/asm/kvm_emulate.h > +++ b/arch/x86/include/asm/kvm_emulate.h > @@ -296,6 +296,7 @@ struct x86_emulate_ctxt { > > bool perm_ok; /* do not check permissions if true */ > bool ud; /* inject an #UD if host doesn't support insn */ > + bool tf; /* TF value before instruction (after for syscall/sysret) */ > > bool have_exception; > struct x86_exception exception; > diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c > index 0816ab2e8adc..80890dee66ce 100644 > --- a/arch/x86/kvm/emulate.c > +++ b/arch/x86/kvm/emulate.c > @@ -2742,6 +2742,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) > ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF); > } > > + ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; > return X86EMUL_CONTINUE; > } > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 87d3cb901935..0e846f0cb83b 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -5313,6 +5313,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) > kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); > > ctxt->eflags = kvm_get_rflags(vcpu); > + ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; > + I guess this is used for "the sysret is executed the #DB is taken "as if" the syscall insn just completed", however, there is no sysret emulation, so how the #DB is taken after the sysret? Regards, Wanpeng Li > ctxt->eip = kvm_rip_read(vcpu); > ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : > (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 : > @@ -5528,36 +5530,25 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, > return dr6; > } > > -static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r) > +static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r) > { > struct kvm_run *kvm_run = vcpu->run; > > - /* > - * rflags is the old, "raw" value of the flags. The new value has > - * not been saved yet. > - * > - * This is correct even for TF set by the guest, because "the > - * processor will not generate this exception after the instruction > - * that sets the TF flag". > - */ > - if (unlikely(rflags & X86_EFLAGS_TF)) { > - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { > - kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | > - DR6_RTM; > - kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; > - kvm_run->debug.arch.exception = DB_VECTOR; > - kvm_run->exit_reason = KVM_EXIT_DEBUG; > - *r = EMULATE_USER_EXIT; > - } else { > - /* > - * "Certain debug exceptions may clear bit 0-3. The > - * remaining contents of the DR6 register are never > - * cleared by the processor". > - */ > - vcpu->arch.dr6 &= ~15; > - vcpu->arch.dr6 |= DR6_BS | DR6_RTM; > - kvm_queue_exception(vcpu, DB_VECTOR); > - } > + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { > + kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM; > + kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; > + kvm_run->debug.arch.exception = DB_VECTOR; > + kvm_run->exit_reason = KVM_EXIT_DEBUG; > + *r = EMULATE_USER_EXIT; > + } else { > + /* > + * "Certain debug exceptions may clear bit 0-3. The > + * remaining contents of the DR6 register are never > + * cleared by the processor". > + */ > + vcpu->arch.dr6 &= ~15; > + vcpu->arch.dr6 |= DR6_BS | DR6_RTM; > + kvm_queue_exception(vcpu, DB_VECTOR); > } > } > > @@ -5567,7 +5558,17 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) > int r = EMULATE_DONE; > > kvm_x86_ops->skip_emulated_instruction(vcpu); > - kvm_vcpu_check_singlestep(vcpu, rflags, &r); > + > + /* > + * rflags is the old, "raw" value of the flags. The new value has > + * not been saved yet. > + * > + * This is correct even for TF set by the guest, because "the > + * processor will not generate this exception after the instruction > + * that sets the TF flag". > + */ > + if (unlikely(rflags & X86_EFLAGS_TF)) > + kvm_vcpu_do_singlestep(vcpu, &r); > return r == EMULATE_DONE; > } > EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction); > @@ -5726,8 +5727,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, > toggle_interruptibility(vcpu, ctxt->interruptibility); > vcpu->arch.emulate_regs_need_sync_to_vcpu = false; > kvm_rip_write(vcpu, ctxt->eip); > - if (r == EMULATE_DONE) > - kvm_vcpu_check_singlestep(vcpu, rflags, &r); > + if (r == EMULATE_DONE && > + (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) > + kvm_vcpu_do_singlestep(vcpu, &r); > if (!ctxt->have_exception || > exception_type(ctxt->exception.vector) == EXCPT_TRAP) > __kvm_set_rflags(vcpu, ctxt->eflags); > -- > 2.13.1 >
On 27/06/2017 05:41, Wanpeng Li wrote: >> KVM emulates syscall so that it can trap 32-bit syscall on Intel processors. > > We have a discussion to not expose syscall/sysret to Intel 32-bit > guest two years ago. https://lkml.org/lkml/2015/11/19/225 The > syscall/sysret just makes sense against long mode instead of > compatibility/legacy mode of Intel CPU. We will get a #UD in 32-bit > guest, and syscall emulation is introduced by commit 66bb2ccd (KVM: > x86 emulator: add syscall emulation) to handle it. So why we still > expose syscall/sysret to Intel 32-bit guest? Because you didn't post v2 of that patch, I guess. :) >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c >> index 87d3cb901935..0e846f0cb83b 100644 >> --- a/arch/x86/kvm/x86.c >> +++ b/arch/x86/kvm/x86.c >> @@ -5313,6 +5313,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) >> kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); >> >> ctxt->eflags = kvm_get_rflags(vcpu); >> + ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; >> + > > I guess this is used for "the sysret is executed the #DB is taken "as > if" the syscall insn just completed", however, there is no sysret > emulation, so how the #DB is taken after the sysret? No, it's used for instructions other than syscall and sysret: > + if (r == EMULATE_DONE && > + (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) > + kvm_vcpu_do_singlestep(vcpu, &r); syscall (and sysret if it were emulated) overwrite ctxt->tf with the value of TF at the end of the instruction. Other instructions don't, so that singlestep depends on EFLAGS.TF before the instruction is executed. Paolo
2017-06-27 16:20 GMT+08:00 Paolo Bonzini <pbonzini@redhat.com>: > > > On 27/06/2017 05:41, Wanpeng Li wrote: >>> KVM emulates syscall so that it can trap 32-bit syscall on Intel processors. >> >> We have a discussion to not expose syscall/sysret to Intel 32-bit >> guest two years ago. https://lkml.org/lkml/2015/11/19/225 The >> syscall/sysret just makes sense against long mode instead of s/long mode/64-bit mode >> compatibility/legacy mode of Intel CPU. We will get a #UD in 32-bit >> guest, and syscall emulation is introduced by commit 66bb2ccd (KVM: >> x86 emulator: add syscall emulation) to handle it. So why we still >> expose syscall/sysret to Intel 32-bit guest? > > Because you didn't post v2 of that patch, I guess. :) > >>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c >>> index 87d3cb901935..0e846f0cb83b 100644 >>> --- a/arch/x86/kvm/x86.c >>> +++ b/arch/x86/kvm/x86.c >>> @@ -5313,6 +5313,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) >>> kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); >>> >>> ctxt->eflags = kvm_get_rflags(vcpu); >>> + ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; >>> + >> >> I guess this is used for "the sysret is executed the #DB is taken "as >> if" the syscall insn just completed", however, there is no sysret >> emulation, so how the #DB is taken after the sysret? > > No, it's used for instructions other than syscall and sysret: > >> + if (r == EMULATE_DONE && >> + (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) >> + kvm_vcpu_do_singlestep(vcpu, &r); > > syscall (and sysret if it were emulated) overwrite ctxt->tf with the > value of TF at the end of the instruction. Other instructions don't, so > that singlestep depends on EFLAGS.TF before the instruction is executed. Why sysret is not emulated since SDM said that it can incur a #UD if not in 64-bit mode? Regards, Wanpeng Li
On 27/06/2017 11:50, Wanpeng Li wrote: > 2017-06-27 16:20 GMT+08:00 Paolo Bonzini <pbonzini@redhat.com>: >> >> >> On 27/06/2017 05:41, Wanpeng Li wrote: >>>> KVM emulates syscall so that it can trap 32-bit syscall on Intel processors. >>> >>> We have a discussion to not expose syscall/sysret to Intel 32-bit >>> guest two years ago. https://lkml.org/lkml/2015/11/19/225 The >>> syscall/sysret just makes sense against long mode instead of > > s/long mode/64-bit mode > >>> compatibility/legacy mode of Intel CPU. We will get a #UD in 32-bit >>> guest, and syscall emulation is introduced by commit 66bb2ccd (KVM: >>> x86 emulator: add syscall emulation) to handle it. So why we still >>> expose syscall/sysret to Intel 32-bit guest? >> >> Because you didn't post v2 of that patch, I guess. :) >> >>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c >>>> index 87d3cb901935..0e846f0cb83b 100644 >>>> --- a/arch/x86/kvm/x86.c >>>> +++ b/arch/x86/kvm/x86.c >>>> @@ -5313,6 +5313,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) >>>> kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); >>>> >>>> ctxt->eflags = kvm_get_rflags(vcpu); >>>> + ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; >>>> + >>> >>> I guess this is used for "the sysret is executed the #DB is taken "as >>> if" the syscall insn just completed", however, there is no sysret >>> emulation, so how the #DB is taken after the sysret? >> >> No, it's used for instructions other than syscall and sysret: >> >>> + if (r == EMULATE_DONE && >>> + (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) >>> + kvm_vcpu_do_singlestep(vcpu, &r); >> >> syscall (and sysret if it were emulated) overwrite ctxt->tf with the >> value of TF at the end of the instruction. Other instructions don't, so >> that singlestep depends on EFLAGS.TF before the instruction is executed. > > Why sysret is not emulated since SDM said that it can incur a #UD if > not in 64-bit mode? "64-bit ring 0 to 32-bit ring 3" sysret ("sysretl") is supported by Intel: IF (operand size is 64-bit) THEN CS.Selector ← IA32_STAR[63:48]+16; ELSE CS.Selector ← IA32_STAR[63:48]; FI; If you want to add support for emulating sysret, in particular legacy mode sysret, that would be okay. You can extend the new testcase to run in 32-bit mode, too. Paolo
2017-06-27 18:09 GMT+08:00 Paolo Bonzini <pbonzini@redhat.com>: > > > On 27/06/2017 11:50, Wanpeng Li wrote: >> 2017-06-27 16:20 GMT+08:00 Paolo Bonzini <pbonzini@redhat.com>: >>> >>> >>> On 27/06/2017 05:41, Wanpeng Li wrote: >>>>> KVM emulates syscall so that it can trap 32-bit syscall on Intel processors. >>>> >>>> We have a discussion to not expose syscall/sysret to Intel 32-bit >>>> guest two years ago. https://lkml.org/lkml/2015/11/19/225 The >>>> syscall/sysret just makes sense against long mode instead of >> >> s/long mode/64-bit mode >> >>>> compatibility/legacy mode of Intel CPU. We will get a #UD in 32-bit >>>> guest, and syscall emulation is introduced by commit 66bb2ccd (KVM: >>>> x86 emulator: add syscall emulation) to handle it. So why we still >>>> expose syscall/sysret to Intel 32-bit guest? >>> >>> Because you didn't post v2 of that patch, I guess. :) >>> >>>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c >>>>> index 87d3cb901935..0e846f0cb83b 100644 >>>>> --- a/arch/x86/kvm/x86.c >>>>> +++ b/arch/x86/kvm/x86.c >>>>> @@ -5313,6 +5313,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) >>>>> kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); >>>>> >>>>> ctxt->eflags = kvm_get_rflags(vcpu); >>>>> + ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; >>>>> + >>>> >>>> I guess this is used for "the sysret is executed the #DB is taken "as >>>> if" the syscall insn just completed", however, there is no sysret >>>> emulation, so how the #DB is taken after the sysret? >>> >>> No, it's used for instructions other than syscall and sysret: >>> >>>> + if (r == EMULATE_DONE && >>>> + (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) >>>> + kvm_vcpu_do_singlestep(vcpu, &r); >>> >>> syscall (and sysret if it were emulated) overwrite ctxt->tf with the >>> value of TF at the end of the instruction. Other instructions don't, so >>> that singlestep depends on EFLAGS.TF before the instruction is executed. >> >> Why sysret is not emulated since SDM said that it can incur a #UD if >> not in 64-bit mode? > > "64-bit ring 0 to 32-bit ring 3" sysret ("sysretl") is supported by Intel: > > IF (operand size is 64-bit) > THEN CS.Selector ← IA32_STAR[63:48]+16; > ELSE CS.Selector ← IA32_STAR[63:48]; > FI; I see. :) Regards, Wanpeng Li
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 055962615779..722d0e568863 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -296,6 +296,7 @@ struct x86_emulate_ctxt { bool perm_ok; /* do not check permissions if true */ bool ud; /* inject an #UD if host doesn't support insn */ + bool tf; /* TF value before instruction (after for syscall/sysret) */ bool have_exception; struct x86_exception exception; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0816ab2e8adc..80890dee66ce 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2742,6 +2742,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF); } + ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; return X86EMUL_CONTINUE; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 87d3cb901935..0e846f0cb83b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5313,6 +5313,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); ctxt->eflags = kvm_get_rflags(vcpu); + ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; + ctxt->eip = kvm_rip_read(vcpu); ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 : @@ -5528,36 +5530,25 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, return dr6; } -static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r) +static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r) { struct kvm_run *kvm_run = vcpu->run; - /* - * rflags is the old, "raw" value of the flags. The new value has - * not been saved yet. - * - * This is correct even for TF set by the guest, because "the - * processor will not generate this exception after the instruction - * that sets the TF flag". - */ - if (unlikely(rflags & X86_EFLAGS_TF)) { - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | - DR6_RTM; - kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; - kvm_run->debug.arch.exception = DB_VECTOR; - kvm_run->exit_reason = KVM_EXIT_DEBUG; - *r = EMULATE_USER_EXIT; - } else { - /* - * "Certain debug exceptions may clear bit 0-3. The - * remaining contents of the DR6 register are never - * cleared by the processor". - */ - vcpu->arch.dr6 &= ~15; - vcpu->arch.dr6 |= DR6_BS | DR6_RTM; - kvm_queue_exception(vcpu, DB_VECTOR); - } + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { + kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM; + kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; + kvm_run->debug.arch.exception = DB_VECTOR; + kvm_run->exit_reason = KVM_EXIT_DEBUG; + *r = EMULATE_USER_EXIT; + } else { + /* + * "Certain debug exceptions may clear bit 0-3. The + * remaining contents of the DR6 register are never + * cleared by the processor". + */ + vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 |= DR6_BS | DR6_RTM; + kvm_queue_exception(vcpu, DB_VECTOR); } } @@ -5567,7 +5558,17 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) int r = EMULATE_DONE; kvm_x86_ops->skip_emulated_instruction(vcpu); - kvm_vcpu_check_singlestep(vcpu, rflags, &r); + + /* + * rflags is the old, "raw" value of the flags. The new value has + * not been saved yet. + * + * This is correct even for TF set by the guest, because "the + * processor will not generate this exception after the instruction + * that sets the TF flag". + */ + if (unlikely(rflags & X86_EFLAGS_TF)) + kvm_vcpu_do_singlestep(vcpu, &r); return r == EMULATE_DONE; } EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction); @@ -5726,8 +5727,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; kvm_rip_write(vcpu, ctxt->eip); - if (r == EMULATE_DONE) - kvm_vcpu_check_singlestep(vcpu, rflags, &r); + if (r == EMULATE_DONE && + (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) + kvm_vcpu_do_singlestep(vcpu, &r); if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) __kvm_set_rflags(vcpu, ctxt->eflags);