diff mbox

[v4,2/5] x86/msr: Carry on after a non-"safe" MSR access fails without !panic_on_oops

Message ID a3b871a4eb533340d04255409dfecc94f88c647d.1457805972.git.luto@kernel.org (mailing list archive)
State New, archived
Headers show

Commit Message

Andy Lutomirski March 12, 2016, 6:08 p.m. UTC
This demotes an OOPS and likely panic due to a failed non-"safe" MSR
access to a WARN_ONCE and, for RDMSR, a return value of zero.  If
panic_on_oops is set, then failed unsafe MSR accesses will still
oops and panic.

To be clear, this type of failure should *not* happen.  This patch
exists to minimize the chance of nasty undebuggable failures due on
systems that used to work due to a now-fixed CONFIG_PARAVIRT=y bug.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 arch/x86/include/asm/msr.h | 10 ++++++++--
 arch/x86/mm/extable.c      | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 2 deletions(-)

Comments

Borislav Petkov March 14, 2016, 12:02 p.m. UTC | #1
On Sat, Mar 12, 2016 at 10:08:49AM -0800, Andy Lutomirski wrote:
> This demotes an OOPS and likely panic due to a failed non-"safe" MSR
> access to a WARN_ONCE and, for RDMSR, a return value of zero.  If
> panic_on_oops is set, then failed unsafe MSR accesses will still
> oops and panic.
> 
> To be clear, this type of failure should *not* happen.  This patch
> exists to minimize the chance of nasty undebuggable failures due on
> systems that used to work due to a now-fixed CONFIG_PARAVIRT=y bug.
> 
> Signed-off-by: Andy Lutomirski <luto@kernel.org>
> ---
>  arch/x86/include/asm/msr.h | 10 ++++++++--
>  arch/x86/mm/extable.c      | 33 +++++++++++++++++++++++++++++++++
>  2 files changed, 41 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
> index 93fb7c1cffda..1487054a1a70 100644
> --- a/arch/x86/include/asm/msr.h
> +++ b/arch/x86/include/asm/msr.h
> @@ -92,7 +92,10 @@ static inline unsigned long long native_read_msr(unsigned int msr)
>  {
>  	DECLARE_ARGS(val, low, high);
>  
> -	asm volatile("rdmsr" : EAX_EDX_RET(val, low, high) : "c" (msr));
> +	asm volatile("1: rdmsr\n"
> +		     "2:\n"
> +		     _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_unsafe)
> +		     : EAX_EDX_RET(val, low, high) : "c" (msr));
>  	if (msr_tracepoint_active(__tracepoint_read_msr))
>  		do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), 0);
>  	return EAX_EDX_VAL(val, low, high);
> @@ -119,7 +122,10 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr,
>  static inline void native_write_msr(unsigned int msr,
>  				    unsigned low, unsigned high)
>  {
> -	asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
> +	asm volatile("1: wrmsr\n"
> +		     "2:\n"
> +		     _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe)

This might be a good idea:

[    0.220066] cpuidle: using governor menu
[    0.224000] ------------[ cut here ]------------
[    0.224000] WARNING: CPU: 0 PID: 1 at arch/x86/mm/extable.c:74 ex_handler_wrmsr_unsafe+0x73/0x80()
[    0.224000] unchecked MSR access error: WRMSR to 0xdeadbeef (tried to write 0x000000000000caca)
[    0.224000] Modules linked in:
[    0.224000] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.5.0-rc7+ #7
[    0.224000] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014
[    0.224000]  0000000000000000 ffff88007c0d7c08 ffffffff812f13a3 ffff88007c0d7c50
[    0.224000]  ffffffff81a40ffe ffff88007c0d7c40 ffffffff8105c3b1 ffffffff81717710
[    0.224000]  ffff88007c0d7d18 0000000000000000 ffffffff816207d0 0000000000000000
[    0.224000] Call Trace:
[    0.224000]  [<ffffffff812f13a3>] dump_stack+0x67/0x94
[    0.224000]  [<ffffffff8105c3b1>] warn_slowpath_common+0x91/0xd0
[    0.224000]  [<ffffffff816207d0>] ? amd_cpu_notify+0x40/0x40
[    0.224000]  [<ffffffff8105c43c>] warn_slowpath_fmt+0x4c/0x50
[    0.224000]  [<ffffffff816207d0>] ? amd_cpu_notify+0x40/0x40
[    0.224000]  [<ffffffff8131de53>] ? __this_cpu_preempt_check+0x13/0x20
[    0.224000]  [<ffffffff8104efe3>] ex_handler_wrmsr_unsafe+0x73/0x80

and it looks helpful and all but when you do it pretty early - for
example I added a

	 wrmsrl(0xdeadbeef, 0xcafe);

at the end of pat_bsp_init() and the machine explodes with an early
panic. I'm wondering what is better - early panic or an early #GP from a
missing MSR.

And more specifically, can we do better to handle the early case
gracefully too?
Andy Lutomirski March 14, 2016, 5:05 p.m. UTC | #2
On Mon, Mar 14, 2016 at 5:02 AM, Borislav Petkov <bp@alien8.de> wrote:
> On Sat, Mar 12, 2016 at 10:08:49AM -0800, Andy Lutomirski wrote:
>> This demotes an OOPS and likely panic due to a failed non-"safe" MSR
>> access to a WARN_ONCE and, for RDMSR, a return value of zero.  If
>> panic_on_oops is set, then failed unsafe MSR accesses will still
>> oops and panic.
>>
>> To be clear, this type of failure should *not* happen.  This patch
>> exists to minimize the chance of nasty undebuggable failures due on
>> systems that used to work due to a now-fixed CONFIG_PARAVIRT=y bug.
>>
>> Signed-off-by: Andy Lutomirski <luto@kernel.org>
>> ---
>>  arch/x86/include/asm/msr.h | 10 ++++++++--
>>  arch/x86/mm/extable.c      | 33 +++++++++++++++++++++++++++++++++
>>  2 files changed, 41 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
>> index 93fb7c1cffda..1487054a1a70 100644
>> --- a/arch/x86/include/asm/msr.h
>> +++ b/arch/x86/include/asm/msr.h
>> @@ -92,7 +92,10 @@ static inline unsigned long long native_read_msr(unsigned int msr)
>>  {
>>       DECLARE_ARGS(val, low, high);
>>
>> -     asm volatile("rdmsr" : EAX_EDX_RET(val, low, high) : "c" (msr));
>> +     asm volatile("1: rdmsr\n"
>> +                  "2:\n"
>> +                  _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_unsafe)
>> +                  : EAX_EDX_RET(val, low, high) : "c" (msr));
>>       if (msr_tracepoint_active(__tracepoint_read_msr))
>>               do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), 0);
>>       return EAX_EDX_VAL(val, low, high);
>> @@ -119,7 +122,10 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr,
>>  static inline void native_write_msr(unsigned int msr,
>>                                   unsigned low, unsigned high)
>>  {
>> -     asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
>> +     asm volatile("1: wrmsr\n"
>> +                  "2:\n"
>> +                  _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe)
>
> This might be a good idea:
>
> [    0.220066] cpuidle: using governor menu
> [    0.224000] ------------[ cut here ]------------
> [    0.224000] WARNING: CPU: 0 PID: 1 at arch/x86/mm/extable.c:74 ex_handler_wrmsr_unsafe+0x73/0x80()
> [    0.224000] unchecked MSR access error: WRMSR to 0xdeadbeef (tried to write 0x000000000000caca)
> [    0.224000] Modules linked in:
> [    0.224000] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.5.0-rc7+ #7
> [    0.224000] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014
> [    0.224000]  0000000000000000 ffff88007c0d7c08 ffffffff812f13a3 ffff88007c0d7c50
> [    0.224000]  ffffffff81a40ffe ffff88007c0d7c40 ffffffff8105c3b1 ffffffff81717710
> [    0.224000]  ffff88007c0d7d18 0000000000000000 ffffffff816207d0 0000000000000000
> [    0.224000] Call Trace:
> [    0.224000]  [<ffffffff812f13a3>] dump_stack+0x67/0x94
> [    0.224000]  [<ffffffff8105c3b1>] warn_slowpath_common+0x91/0xd0
> [    0.224000]  [<ffffffff816207d0>] ? amd_cpu_notify+0x40/0x40
> [    0.224000]  [<ffffffff8105c43c>] warn_slowpath_fmt+0x4c/0x50
> [    0.224000]  [<ffffffff816207d0>] ? amd_cpu_notify+0x40/0x40
> [    0.224000]  [<ffffffff8131de53>] ? __this_cpu_preempt_check+0x13/0x20
> [    0.224000]  [<ffffffff8104efe3>] ex_handler_wrmsr_unsafe+0x73/0x80
>
> and it looks helpful and all but when you do it pretty early - for
> example I added a
>
>          wrmsrl(0xdeadbeef, 0xcafe);
>
> at the end of pat_bsp_init() and the machine explodes with an early
> panic. I'm wondering what is better - early panic or an early #GP from a
> missing MSR.

You're hitting:

    /* special handling not supported during early boot */
    if (handler != ex_handler_default)
        return 0;


which means that the behavior with and without my series applied is
identical, for better or for worse.

>
> And more specifically, can we do better to handle the early case
> gracefully too?

We could probably remove that check and let custom fixups run early.
I don't see any compelling reason to keep them disabled.  That should
probably be a separate change, though.

--Andy
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andy Lutomirski March 14, 2016, 5:17 p.m. UTC | #3
On Mon, Mar 14, 2016 at 10:11 AM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> On Mar 14, 2016 10:05 AM, "Andy Lutomirski" <luto@amacapital.net> wrote:
>>
>> We could probably remove that check and let custom fixups run early.
>> I don't see any compelling reason to keep them disabled.  That should
>> probably be a separate change, though.
>
> Or we could just use the existing wrmsr_safe() code and not add this new
> special code at all.
>
> Look, why are you doing this? We should get rid of the difference between
> wrmsr and wrmsr_safe(), not make it bigger.
>
> Just make everything safe. There has never in the history of anything been
> an advantage to making things oops and to making things more complicated.
>
> Why is rd/wrmsr() so magically important?

Because none of this is actually safe unless the code that calls
whatever_safe actually does something intelligent with the return
value.  I think that most code that does rdmsr or wrmsr actually
thinks "I know that this MSR exists and I want to access it" and the
code may actually malfunction if the access fails.  So I think we
really do want the warning so we can fix the bugs if they happen.  And
I wouldn't be at all surprised if there's a bug or two in perf where
some perf event registers itself incorrectly in some cases because it
messed up the probe sequence.  We don't want to totally ignore the
resulting failure of the perf event -- we want to get a warning so
that we know about the bug.

Or suppose we're writing some important but easy-to-miss MSR, like PAT
or one of the excessive number of system call setup MSRs.  If the
write fails, the system could easily still appear to work until
something unfortunate happens.

So yes, let's please warn.  I'm okay with removing the panic_on_oops
thing though.  (But if anyone suggests that we should stop OOPSing on
bad kernel page faults, I *will* fight back.)

--Andy
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Linus Torvalds March 14, 2016, 6:04 p.m. UTC | #4
On Mon, Mar 14, 2016 at 10:17 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>
> So yes, let's please warn.  I'm okay with removing the panic_on_oops
> thing though.  (But if anyone suggests that we should stop OOPSing on
> bad kernel page faults, I *will* fight back.)

Bad kernel page faults are something completely different. They would
be actual bugs regardless.

The MSR thing has *often* been just silly "this CPU doesn't do that
MSR". Generic bootup setup code etc that just didn't know or care
about the particular badly documented rule for one particular random
CPU version and stepping.

In fact, when I say "often", I suspect I should really just say
"always". I don't think we've ever found a case where oopsing would
have been the right thing. But it has definitely caused lots of
problems, especially in the early paths where your code doesn't even
work right now.

Now, when it comes to the warning, I guess I could live with it, but I
think it's stupid to make this a low-level exception handler thing.

So what I think should be done:

 - make sure that wr/rdmsr_safe() actually works during very early
init. At some point it didn't.

 - get rid of the current wrmsr/rdmsr *entirely*. It's shit.

 - Add this wrapper:

      #define wrmsr(msr, low, high) \
        WARN_ON_ONCE(wrmsr_safe(msr, low, high))

and be done with it. We could even decide to make that WARN_ON_ONCE()
be something we could configure out, because it's really a debugging
thing and isn't like it should be all that fatal.

None of this insane complicated crap that buys us exactly *nothing*,
and depends on fancy new exception handling support etc etc.

So what's the downside to just doing this simple thing?

                      Linus
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andy Lutomirski March 14, 2016, 6:10 p.m. UTC | #5
On Mon, Mar 14, 2016 at 11:04 AM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> On Mon, Mar 14, 2016 at 10:17 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>>
>> So yes, let's please warn.  I'm okay with removing the panic_on_oops
>> thing though.  (But if anyone suggests that we should stop OOPSing on
>> bad kernel page faults, I *will* fight back.)
>
> Bad kernel page faults are something completely different. They would
> be actual bugs regardless.
>
> The MSR thing has *often* been just silly "this CPU doesn't do that
> MSR". Generic bootup setup code etc that just didn't know or care
> about the particular badly documented rule for one particular random
> CPU version and stepping.
>
> In fact, when I say "often", I suspect I should really just say
> "always". I don't think we've ever found a case where oopsing would
> have been the right thing. But it has definitely caused lots of
> problems, especially in the early paths where your code doesn't even
> work right now.

I can fix that part by literally deleting a few lines of code.  I just
need to audit things to make sure that won't break anything.

>
> Now, when it comes to the warning, I guess I could live with it, but I
> think it's stupid to make this a low-level exception handler thing.
>
> So what I think should be done:
>
>  - make sure that wr/rdmsr_safe() actually works during very early
> init. At some point it didn't.
>
>  - get rid of the current wrmsr/rdmsr *entirely*. It's shit.
>
>  - Add this wrapper:
>
>       #define wrmsr(msr, low, high) \
>         WARN_ON_ONCE(wrmsr_safe(msr, low, high))
>
> and be done with it. We could even decide to make that WARN_ON_ONCE()
> be something we could configure out, because it's really a debugging
> thing and isn't like it should be all that fatal.
>
> None of this insane complicated crap that buys us exactly *nothing*,
> and depends on fancy new exception handling support etc etc.
>
> So what's the downside to just doing this simple thing?

More code size increase and extra branches on fast paths.  Using the
fancy new exception handling means we get to have the check and the
warning completely out of line.  In fact, I'm tempted to change the
_safe variants to use the fancy new handling as well (once it works in
early boot) to shorten the code size and remove some asm code.

A couple of the wrmsr users actually care about performance.  These
are the ones involved in context switching and, to a lesser extent, in
switching in and out of guest mode.

--Andy
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Linus Torvalds March 14, 2016, 6:10 p.m. UTC | #6
On Mon, Mar 14, 2016 at 11:04 AM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> None of this insane complicated crap that buys us exactly *nothing*,
> and depends on fancy new exception handling support etc etc.

Actually, the one _new_ thing we could do is to instead of removing
the old crappy rdmsr()/wrmsr() implementation entirely, we'll just
rename it to "rd/wrmsr_unsafe()", and not have any exception table
stuff for that at all (so now you *will* get an oops and panic if you
use that).

The only reason to do that is for performance-critical MSR's that
really don't want any overhead. Sure, they could just be changed to
use "wrmsr_safe()", but for things like the APIC accesses, or
update_debugctlmsr() (that is supposed to check for processor version)
that can be truly critical, an explicitly _unsafe_ version may be the
right thing to do.

The fact is, the problem with rd/wrmsr is that we just did the
defaults the wrong way around. Making the simple and obvious version
be unsafe is just wrong.

              Linus
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Linus Torvalds March 14, 2016, 6:15 p.m. UTC | #7
On Mon, Mar 14, 2016 at 11:10 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>
> A couple of the wrmsr users actually care about performance.  These
> are the ones involved in context switching and, to a lesser extent, in
> switching in and out of guest mode.

.. ok, see the crossed emails.

I'd *much* rather special case the special cases. Not make the generic
case something complex.

And *particularly* not make the generic case be something where people
think it's sane to oops and kill the machine. Christ.

                  Linus
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andy Lutomirski March 14, 2016, 6:24 p.m. UTC | #8
On Mon, Mar 14, 2016 at 11:15 AM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> On Mon, Mar 14, 2016 at 11:10 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>>
>> A couple of the wrmsr users actually care about performance.  These
>> are the ones involved in context switching and, to a lesser extent, in
>> switching in and out of guest mode.
>
> .. ok, see the crossed emails.
>
> I'd *much* rather special case the special cases. Not make the generic
> case something complex.

The code in my queue is, literally:

bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup,
                 struct pt_regs *regs, int trapnr)
{
    WARN_ONCE(1, "unchecked MSR access error: RDMSR from 0x%x",
          (unsigned int)regs->cx);

    /* Pretend that the read succeeded and returned 0. */
    regs->ip = ex_fixup_addr(fixup);
    regs->ax = 0;
    regs->dx = 0;
    return true;
}
EXPORT_SYMBOL(ex_handler_rdmsr_unsafe);

The only regard in which this is any more complex than the _safe
variant is because there's a warning (one line of code) and because
the _safe variant forgot to zero the result (two lines of code).  My
series fixes the latter, so we're talking about almost no source code
size difference.  There *is* a difference in binary size, though --
the _safe variant emits a copy of its fixup every time it appears,
whereas the new fixup appears once.

So I think we should apply my patches (with the early handling fixed
and the panic_on_oops removed), and then consider reimplementing the
_safe variant using fancy handlers to reduce number of lines of asm
and code size, and then consider changing the overall API on top if we
think there's a better API to be had.

Is that okay?

>
> And *particularly* not make the generic case be something where people
> think it's sane to oops and kill the machine. Christ.

I've already removed the panic_on_oops thing from my tree.

--Andy
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Linus Torvalds March 14, 2016, 6:40 p.m. UTC | #9
On Mon, Mar 14, 2016 at 11:24 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>
> The code in my queue is, literally:
>
> bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup,
>                  struct pt_regs *regs, int trapnr)
> {
>     WARN_ONCE(1, "unchecked MSR access error: RDMSR from 0x%x",
>           (unsigned int)regs->cx);
>
>     /* Pretend that the read succeeded and returned 0. */
>     regs->ip = ex_fixup_addr(fixup);
>     regs->ax = 0;
>     regs->dx = 0;
>     return true;
> }
> EXPORT_SYMBOL(ex_handler_rdmsr_unsafe);

I guess I can live with this, as long as we also extend the
early-fault handling to work with the special exception handlers.

And as long as people start understanding that killing the machine is
a bad bad bad thing. It's a debugging nightmare.

               Linus
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andy Lutomirski March 14, 2016, 6:48 p.m. UTC | #10
On Mon, Mar 14, 2016 at 11:40 AM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> On Mon, Mar 14, 2016 at 11:24 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>>
>> The code in my queue is, literally:
>>
>> bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup,
>>                  struct pt_regs *regs, int trapnr)
>> {
>>     WARN_ONCE(1, "unchecked MSR access error: RDMSR from 0x%x",
>>           (unsigned int)regs->cx);
>>
>>     /* Pretend that the read succeeded and returned 0. */
>>     regs->ip = ex_fixup_addr(fixup);
>>     regs->ax = 0;
>>     regs->dx = 0;
>>     return true;
>> }
>> EXPORT_SYMBOL(ex_handler_rdmsr_unsafe);
>
> I guess I can live with this, as long as we also extend the
> early-fault handling to work with the special exception handlers.

OK, will do.  I need to rewrork the early IDT code a bit so it
generates a real pt_regs layout, but that's arguably a cleanup anyway.

--Andy
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra March 14, 2016, 8:18 p.m. UTC | #11
On Mon, Mar 14, 2016 at 11:10:16AM -0700, Andy Lutomirski wrote:
> A couple of the wrmsr users actually care about performance.  These
> are the ones involved in context switching and, to a lesser extent, in
> switching in and out of guest mode.

Right, this very much includes a number of perf MSRs. Some of those get
called from the context switch path, others from the NMI handler.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ingo Molnar March 15, 2016, 10:21 a.m. UTC | #12
* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Mon, Mar 14, 2016 at 10:17 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> >
> > So yes, let's please warn.  I'm okay with removing the panic_on_oops
> > thing though.  (But if anyone suggests that we should stop OOPSing on
> > bad kernel page faults, I *will* fight back.)
> 
> Bad kernel page faults are something completely different. They would
> be actual bugs regardless.
> 
> The MSR thing has *often* been just silly "this CPU doesn't do that
> MSR". Generic bootup setup code etc that just didn't know or care
> about the particular badly documented rule for one particular random
> CPU version and stepping.
> 
> In fact, when I say "often", I suspect I should really just say
> "always". I don't think we've ever found a case where oopsing would
> have been the right thing. But it has definitely caused lots of
> problems, especially in the early paths where your code doesn't even
> work right now.
> 
> Now, when it comes to the warning, I guess I could live with it, but I
> think it's stupid to make this a low-level exception handler thing.
> 
> So what I think should be done:
> 
>  - make sure that wr/rdmsr_safe() actually works during very early
> init. At some point it didn't.
> 
>  - get rid of the current wrmsr/rdmsr *entirely*. It's shit.
> 
>  - Add this wrapper:
> 
>       #define wrmsr(msr, low, high) \
>         WARN_ON_ONCE(wrmsr_safe(msr, low, high))
> 
> and be done with it. We could even decide to make that WARN_ON_ONCE()
> be something we could configure out, because it's really a debugging
> thing and isn't like it should be all that fatal.
> 
> None of this insane complicated crap that buys us exactly *nothing*,
> and depends on fancy new exception handling support etc etc.
> 
> So what's the downside to just doing this simple thing?

This was my original suggestion as well.

The objection was that sometimes it's performance critical. To which I replied 
that 1) I highly doubt that a single extra branch of the check is measurable 2) 
and even if so, then _those_ performance critical cases should be done in some 
special way (rdmsr_nocheck() or whatever) - at which point the argument was that 
there's a lot of such cases.

Which final line of argument I still don't really buy, btw., but it became a me 
against everyone else argument and I cowardly punted.

Now that the 800 lb gorilla is on my side I of course stand my ground, principles 
are principles!

Thanks,

	Ingo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ingo Molnar March 15, 2016, 10:22 a.m. UTC | #13
* Andy Lutomirski <luto@amacapital.net> wrote:

> On Mon, Mar 14, 2016 at 11:40 AM, Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
> > On Mon, Mar 14, 2016 at 11:24 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> >>
> >> The code in my queue is, literally:
> >>
> >> bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup,
> >>                  struct pt_regs *regs, int trapnr)
> >> {
> >>     WARN_ONCE(1, "unchecked MSR access error: RDMSR from 0x%x",
> >>           (unsigned int)regs->cx);
> >>
> >>     /* Pretend that the read succeeded and returned 0. */
> >>     regs->ip = ex_fixup_addr(fixup);
> >>     regs->ax = 0;
> >>     regs->dx = 0;
> >>     return true;
> >> }
> >> EXPORT_SYMBOL(ex_handler_rdmsr_unsafe);
> >
> > I guess I can live with this, as long as we also extend the
> > early-fault handling to work with the special exception handlers.
> 
> OK, will do.  I need to rewrork the early IDT code a bit so it
> generates a real pt_regs layout, but that's arguably a cleanup anyway.

Ok, with that's I'm pretty happy about it as well.

Thanks,

	Ingo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ingo Molnar March 15, 2016, 10:26 a.m. UTC | #14
* Ingo Molnar <mingo@kernel.org> wrote:

> * Andy Lutomirski <luto@amacapital.net> wrote:
> 
> > On Mon, Mar 14, 2016 at 11:40 AM, Linus Torvalds
> > <torvalds@linux-foundation.org> wrote:
> > > On Mon, Mar 14, 2016 at 11:24 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> > >>
> > >> The code in my queue is, literally:
> > >>
> > >> bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup,
> > >>                  struct pt_regs *regs, int trapnr)
> > >> {
> > >>     WARN_ONCE(1, "unchecked MSR access error: RDMSR from 0x%x",
> > >>           (unsigned int)regs->cx);
> > >>
> > >>     /* Pretend that the read succeeded and returned 0. */
> > >>     regs->ip = ex_fixup_addr(fixup);
> > >>     regs->ax = 0;
> > >>     regs->dx = 0;
> > >>     return true;
> > >> }
> > >> EXPORT_SYMBOL(ex_handler_rdmsr_unsafe);
> > >
> > > I guess I can live with this, as long as we also extend the
> > > early-fault handling to work with the special exception handlers.
> > 
> > OK, will do.  I need to rewrork the early IDT code a bit so it
> > generates a real pt_regs layout, but that's arguably a cleanup anyway.
> 
> Ok, with that's I'm pretty happy about it as well.

Note, I think it's pretty clear at this point that this is v4.7 material.

Thanks,

	Ingo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 93fb7c1cffda..1487054a1a70 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -92,7 +92,10 @@  static inline unsigned long long native_read_msr(unsigned int msr)
 {
 	DECLARE_ARGS(val, low, high);
 
-	asm volatile("rdmsr" : EAX_EDX_RET(val, low, high) : "c" (msr));
+	asm volatile("1: rdmsr\n"
+		     "2:\n"
+		     _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_unsafe)
+		     : EAX_EDX_RET(val, low, high) : "c" (msr));
 	if (msr_tracepoint_active(__tracepoint_read_msr))
 		do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), 0);
 	return EAX_EDX_VAL(val, low, high);
@@ -119,7 +122,10 @@  static inline unsigned long long native_read_msr_safe(unsigned int msr,
 static inline void native_write_msr(unsigned int msr,
 				    unsigned low, unsigned high)
 {
-	asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
+	asm volatile("1: wrmsr\n"
+		     "2:\n"
+		     _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe)
+		     : : "c" (msr), "a"(low), "d" (high) : "memory");
 	if (msr_tracepoint_active(__tracepoint_read_msr))
 		do_trace_write_msr(msr, ((u64)high << 32 | low), 0);
 }
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 9dd7e4b7fcde..9e6749b34524 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -49,6 +49,39 @@  bool ex_handler_ext(const struct exception_table_entry *fixup,
 }
 EXPORT_SYMBOL(ex_handler_ext);
 
+bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup,
+			     struct pt_regs *regs, int trapnr)
+{
+	WARN_ONCE(1, "unchecked MSR access error: RDMSR from 0x%x",
+		  (unsigned int)regs->cx);
+
+	/* If panic_on_oops is set, don't try to recover. */
+	if (panic_on_oops)
+		return false;
+
+	regs->ip = ex_fixup_addr(fixup);
+	regs->ax = 0;
+	regs->dx = 0;
+	return true;
+}
+EXPORT_SYMBOL(ex_handler_rdmsr_unsafe);
+
+bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup,
+			     struct pt_regs *regs, int trapnr)
+{
+	WARN_ONCE(1, "unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x)\n",
+		  (unsigned int)regs->cx,
+		  (unsigned int)regs->dx, (unsigned int)regs->ax);
+
+	/* If panic_on_oops is set, don't try to recover. */
+	if (panic_on_oops)
+		return false;
+
+	regs->ip = ex_fixup_addr(fixup);
+	return true;
+}
+EXPORT_SYMBOL(ex_handler_wrmsr_unsafe);
+
 bool ex_has_fault_handler(unsigned long ip)
 {
 	const struct exception_table_entry *e;