diff mbox

[RFC,09/10] x86/enter: Create macros to restrict/unrestrict Indirect Branch Speculation

Message ID 1516476182-5153-10-git-send-email-karahmed@amazon.de (mailing list archive)
State New, archived
Headers show

Commit Message

KarimAllah Ahmed Jan. 20, 2018, 7:23 p.m. UTC
From: Tim Chen <tim.c.chen@linux.intel.com>

Create macros to control Indirect Branch Speculation.

Name them so they reflect what they are actually doing.
The macros are used to restrict and unrestrict the indirect branch speculation.
They do not *disable* (or *enable*) indirect branch speculation. A trip back to
user-space after *restricting* speculation would still affect the BTB.

Quoting from a commit by Tim Chen:

"""
    If IBRS is set, near returns and near indirect jumps/calls will not allow
    their predicted target address to be controlled by code that executed in a
    less privileged prediction mode *BEFORE* the IBRS mode was last written with
    a value of 1 or on another logical processor so long as all Return Stack
    Buffer (RSB) entries from the previous less privileged prediction mode are
    overwritten.

    Thus a near indirect jump/call/return may be affected by code in a less
    privileged prediction mode that executed *AFTER* IBRS mode was last written
    with a value of 1.
"""

[ tglx: Changed macro names and rewrote changelog ]
[ karahmed: changed macro names *again* and rewrote changelog ]

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Arjan Van De Ven <arjan.van.de.ven@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: David Woodhouse <dwmw@amazon.co.uk>
Cc: Ashok Raj <ashok.raj@intel.com>
Link: https://lkml.kernel.org/r/3aab341725ee6a9aafd3141387453b45d788d61a.1515542293.git.tim.c.chen@linux.intel.com
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/entry/calling.h | 73 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

Comments

Andy Lutomirski Jan. 21, 2018, 7:14 p.m. UTC | #1
> On Jan 20, 2018, at 11:23 AM, KarimAllah Ahmed <karahmed@amazon.de> wrote:
> 
> From: Tim Chen <tim.c.chen@linux.intel.com>
> 
> Create macros to control Indirect Branch Speculation.
> 
> Name them so they reflect what they are actually doing.
> The macros are used to restrict and unrestrict the indirect branch speculation.
> They do not *disable* (or *enable*) indirect branch speculation. A trip back to
> user-space after *restricting* speculation would still affect the BTB.
> 
> Quoting from a commit by Tim Chen:
> 
> """
>    If IBRS is set, near returns and near indirect jumps/calls will not allow
>    their predicted target address to be controlled by code that executed in a
>    less privileged prediction mode *BEFORE* the IBRS mode was last written with
>    a value of 1 or on another logical processor so long as all Return Stack
>    Buffer (RSB) entries from the previous less privileged prediction mode are
>    overwritten.
> 
>    Thus a near indirect jump/call/return may be affected by code in a less
>    privileged prediction mode that executed *AFTER* IBRS mode was last written
>    with a value of 1.
> """
> 
> [ tglx: Changed macro names and rewrote changelog ]
> [ karahmed: changed macro names *again* and rewrote changelog ]
> 
> Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Cc: Andi Kleen <ak@linux.intel.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Greg KH <gregkh@linuxfoundation.org>
> Cc: Dave Hansen <dave.hansen@intel.com>
> Cc: Andy Lutomirski <luto@kernel.org>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Cc: Arjan Van De Ven <arjan.van.de.ven@intel.com>
> Cc: Linus Torvalds <torvalds@linux-foundation.org>
> Cc: David Woodhouse <dwmw@amazon.co.uk>
> Cc: Ashok Raj <ashok.raj@intel.com>
> Link: https://lkml.kernel.org/r/3aab341725ee6a9aafd3141387453b45d788d61a.1515542293.git.tim.c.chen@linux.intel.com
> Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
> ---
> arch/x86/entry/calling.h | 73 ++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 73 insertions(+)
> 
> diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
> index 3f48f69..5aafb51 100644
> --- a/arch/x86/entry/calling.h
> +++ b/arch/x86/entry/calling.h
> @@ -6,6 +6,8 @@
> #include <asm/percpu.h>
> #include <asm/asm-offsets.h>
> #include <asm/processor-flags.h>
> +#include <asm/msr-index.h>
> +#include <asm/cpufeatures.h>
> 
> /*
> 
> @@ -349,3 +351,74 @@ For 32-bit we have the following conventions - kernel is built with
> .Lafter_call_\@:
> #endif
> .endm
> +
> +/*
> + * IBRS related macros
> + */
> +.macro PUSH_MSR_REGS
> +    pushq    %rax
> +    pushq    %rcx
> +    pushq    %rdx
> +.endm
> +
> +.macro POP_MSR_REGS
> +    popq    %rdx
> +    popq    %rcx
> +    popq    %rax
> +.endm
> +
> +.macro WRMSR_ASM msr_nr:req edx_val:req eax_val:req
> +    movl    \msr_nr, %ecx
> +    movl    \edx_val, %edx
> +    movl    \eax_val, %eax
> +    wrmsr
> +.endm
> +
> +.macro RESTRICT_IB_SPEC
> +    ALTERNATIVE "jmp .Lskip_\@", "", X86_FEATURE_IBRS
> +    PUSH_MSR_REGS
> +    WRMSR_ASM $MSR_IA32_SPEC_CTRL, $0, $SPEC_CTRL_IBRS
> +    POP_MSR_REGS
> +.Lskip_\@:
> +.endm
> +
> +.macro UNRESTRICT_IB_SPEC
> +    ALTERNATIVE "jmp .Lskip_\@", "", X86_FEATURE_IBRS
> +    PUSH_MSR_REGS
> +    WRMSR_ASM $MSR_IA32_SPEC_CTRL, $0, $0

I think you should be writing 2, not 0, since I'm reasonably confident that we want STIBP on.  Can you explain why you're writing 0?

Also, holy cow, there are so many macros here.

And a meta question: why are there so many submitters of the same series?
David Woodhouse Jan. 21, 2018, 8:28 p.m. UTC | #2
On Sun, 2018-01-21 at 11:34 -0800, Linus Torvalds wrote:
> All of this is pure garbage.
> 
> Is Intel really planning on making this shit architectural? Has
> anybody talked to them and told them they are f*cking insane?
> 
> Please, any Intel engineers here - talk to your managers. 

If the alternative was a two-decade product recall and giving everyone
free CPUs, I'm not sure it was entirely insane.

Certainly it's a nasty hack, but hey — the world was on fire and in the
end we didn't have to just turn the datacentres off and go back to goat
farming, so it's not all bad.

As a hack for existing CPUs, it's just about tolerable — as long as it
can die entirely by the next generation.

So the part is I think is odd is the IBRS_ALL feature, where a future
CPU will advertise "I am able to be not broken" and then you have to
set the IBRS bit once at boot time to *ask* it not to be broken. That
part is weird, because it ought to have been treated like the RDCL_NO
bit — just "you don't have to worry any more, it got better".

https://software.intel.com/sites/default/files/managed/c5/63/336996-Speculative-Execution-Side-Channel-Mitigations.pdf

We do need the IBPB feature to complete the protection that retpoline
gives us — it's that or rebuild all of userspace with retpoline.

We'll also want to expose IBRS to VM guests, since Windows uses it.

I think we could probably live without the IBRS frobbing in our own
syscall/interrupt paths, as long as we're prepared to live with the
very hypothetical holes that still exist on Skylake. Because I like
IBRS more... no, let me rephrase... I hate IBRS less than I hate the
'deepstack' and other stuff that was being proposed to make Skylake
almost safe with retpoline.
Linus Torvalds Jan. 21, 2018, 9:35 p.m. UTC | #3
On Sun, Jan 21, 2018 at 12:28 PM, David Woodhouse <dwmw2@infradead.org> wrote:
> On Sun, 2018-01-21 at 11:34 -0800, Linus Torvalds wrote:
>> All of this is pure garbage.
>>
>> Is Intel really planning on making this shit architectural? Has
>> anybody talked to them and told them they are f*cking insane?
>>
>> Please, any Intel engineers here - talk to your managers.
>
> If the alternative was a two-decade product recall and giving everyone
> free CPUs, I'm not sure it was entirely insane.

You seem to have bought into the cool-aid. Please add a healthy dose
of critical thinking. Because this isn't the kind of cool-aid that
makes for a fun trip with pretty pictures. This is the kind that melts
your brain.

> Certainly it's a nasty hack, but hey — the world was on fire and in the
> end we didn't have to just turn the datacentres off and go back to goat
> farming, so it's not all bad.

It's not that it's a nasty hack. It's much worse than that.

> As a hack for existing CPUs, it's just about tolerable — as long as it
> can die entirely by the next generation.

That's part of the big problem here. The speculation control cpuid
stuff shows that Intel actually seems to plan on doing the right thing
for meltdown (the main question being _when_). Which is not a huge
surprise, since it should be easy to fix, and it's a really honking
big hole to drive through. Not doing the right thing for meltdown
would be completely unacceptable.

So the IBRS garbage implies that Intel is _not_ planning on doing the
right thing for the indirect branch speculation.

Honestly, that's completely unacceptable too.

> So the part is I think is odd is the IBRS_ALL feature, where a future
> CPU will advertise "I am able to be not broken" and then you have to
> set the IBRS bit once at boot time to *ask* it not to be broken. That
> part is weird, because it ought to have been treated like the RDCL_NO
> bit — just "you don't have to worry any more, it got better".

It's not "weird" at all. It's very much part of the whole "this is
complete garbage" issue.

The whole IBRS_ALL feature to me very clearly says "Intel is not
serious about this, we'll have a ugly hack that will be so expensive
that we don't want to enable it by default, because that would look
bad in benchmarks".

So instead they try to push the garbage down to us. And they are doing
it entirely wrong, even from a technical standpoint.

I'm sure there is some lawyer there who says "we'll have to go through
motions to protect against a lawsuit". But legal reasons do not make
for good technology, or good patches that I should apply.

> We do need the IBPB feature to complete the protection that retpoline
> gives us — it's that or rebuild all of userspace with retpoline.

BULLSHIT.

Have you _looked_ at the patches you are talking about?  You should
have - several of them bear your name.

The patches do things like add the garbage MSR writes to the kernel
entry/exit points. That's insane. That says "we're trying to protect
the kernel".  We already have retpoline there, with less overhead.

So somebody isn't telling the truth here. Somebody is pushing complete
garbage for unclear reasons. Sorry for having to point that out.

If this was about flushing the BTB at actual context switches between
different users, I'd believe you. But that's not at all what the
patches do.

As it is, the patches  are COMPLETE AND UTTER GARBAGE.

They do literally insane things. They do things that do not make
sense. That makes all your arguments questionable and suspicious. The
patches do things that are not sane.

WHAT THE F*CK IS GOING ON?

And that's actually ignoring the much _worse_ issue, namely that the
whole hardware interface is literally mis-designed by morons.

It's mis-designed for two major reasons:

 - the "the interface implies Intel will never fix it" reason.

   See the difference between IBRS_ALL and RDCL_NO. One implies Intel
will fix something. The other does not.

   Do you really think that is acceptable?

 - the "there is no performance indicator".

   The whole point of having cpuid and flags from the
microarchitecture is that we can use those to make decisions.

   But since we already know that the IBRS overhead is <i>huge</i> on
existing hardware, all those hardware capability bits are just
complete and utter garbage. Nobody sane will use them, since the cost
is too damn high. So you end up having to look at "which CPU stepping
is this" anyway.

I think we need something better than this garbage.

                Linus
David Woodhouse Jan. 21, 2018, 10 p.m. UTC | #4
On Sun, 2018-01-21 at 13:35 -0800, Linus Torvalds wrote:
> On Sun, Jan 21, 2018 at 12:28 PM, David Woodhouse  wrote:
> > As a hack for existing CPUs, it's just about tolerable — as long as it
> > can die entirely by the next generation.
>
> That's part of the big problem here. The speculation control cpuid
> stuff shows that Intel actually seems to plan on doing the right thing
> for meltdown (the main question being _when_). Which is not a huge
> surprise, since it should be easy to fix, and it's a really honking
> big hole to drive through. Not doing the right thing for meltdown
> would be completely unacceptable.
> 
> So the IBRS garbage implies that Intel is _not_ planning on doing the
> right thing for the indirect branch speculation.
> 
> Honestly, that's completely unacceptable too.

Agreed. I've been saying that since I first saw the IBRS_ALL proposal.
There's *no* good reason for it to be opt-in. Just fix it!

> > So the part is I think is odd is the IBRS_ALL feature, where a future
> > CPU will advertise "I am able to be not broken" and then you have to
> > set the IBRS bit once at boot time to *ask* it not to be broken. That
> > part is weird, because it ought to have been treated like the RDCL_NO
> > bit — just "you don't have to worry any more, it got better".
>
> It's not "weird" at all. It's very much part of the whole "this is
> complete garbage" issue.
> 
> The whole IBRS_ALL feature to me very clearly says "Intel is not
> serious about this, we'll have a ugly hack that will be so expensive
> that we don't want to enable it by default, because that would look
> bad in benchmarks".
> 
> So instead they try to push the garbage down to us. And they are doing
> it entirely wrong, even from a technical standpoint.

Right. The whole IBRS/IBPB thing as a nasty hack in the short term I
could live with, but it's the long-term implications of IBRS_ALL that
I'm unhappy about.

My understanding was that the IBRS_ALL performance was supposed to not
suck — to the extent that we'd just turn it on and then ALTERNATIVE out
the retpolines, and that would be the best option.

But if that's the case, why are they making it an option, and not just
doing the same as RDCL_NO does for "we fixed Meltdown"?

> > We do need the IBPB feature to complete the protection that retpoline
> > gives us — it's that or rebuild all of userspace with retpoline.
>
> BULLSHIT.
> 
> Have you _looked_ at the patches you are talking about?  You should
> have - several of them bear your name.
> 
> The patches do things like add the garbage MSR writes to the kernel
> entry/exit points. That's insane. That says "we're trying to protect
> the kernel".  We already have retpoline there, with less overhead.

You're looking at IBRS usage, not IBPB. They are different things.

Yes, the one you're looking at really *is* trying to protect the
kernel, and you're right that it's largely redundant with retpoline.
(Assuming we can live with the implications on Skylake, as I said.)

> If this was about flushing the BTB at actual context switches between
> different users, I'd believe you. But that's not at all what the
> patches do.

That's what the *IBPB* patches do. Those were deliberately put first in
the series (and in fact that's where I stopped, when I posted).
Linus Torvalds Jan. 21, 2018, 10:27 p.m. UTC | #5
On Sun, Jan 21, 2018 at 2:00 PM, David Woodhouse <dwmw2@infradead.org> wrote:
>>
>> The patches do things like add the garbage MSR writes to the kernel
>> entry/exit points. That's insane. That says "we're trying to protect
>> the kernel".  We already have retpoline there, with less overhead.
>
> You're looking at IBRS usage, not IBPB. They are different things.

Ehh. Odd intel naming detail.

If you look at this series, it very much does that kernel entry/exit
stuff. It was patch 10/10, iirc. In fact, the patch I was replying to
was explicitly setting that garbage up.

And I really don't want to see these garbage patches just mindlessly
sent around.

                  Linus
David Woodhouse Jan. 22, 2018, 4:27 p.m. UTC | #6
On Sun, 2018-01-21 at 14:27 -0800, Linus Torvalds wrote:
> On Sun, Jan 21, 2018 at 2:00 PM, David Woodhouse <dwmw2@infradead.org> wrote:
> >>
> >> The patches do things like add the garbage MSR writes to the kernel
> >> entry/exit points. That's insane. That says "we're trying to protect
> >> the kernel".  We already have retpoline there, with less overhead.
> >
> > You're looking at IBRS usage, not IBPB. They are different things.
> 
> Ehh. Odd intel naming detail.
> 
> If you look at this series, it very much does that kernel entry/exit
> stuff. It was patch 10/10, iirc. In fact, the patch I was replying to
> was explicitly setting that garbage up.
> 
> And I really don't want to see these garbage patches just mindlessly
> sent around.

I think we've covered the technical part of this now, not that you like
it — not that any of us *like* it. But since the peanut gallery is
paying lots of attention it's probably worth explaining it a little
more for their benefit.

This is all about Spectre variant 2, where the CPU can be tricked into
mispredicting the target of an indirect branch. And I'm specifically
looking at what we can do on *current* hardware, where we're limited to
the hacks they can manage to add in the microcode.

The new microcode from Intel and AMD adds three new features.

One new feature (IBPB) is a complete barrier for branch prediction.
After frobbing this, no branch targets learned earlier are going to be
used. It's kind of expensive (order of magnitude ~4000 cycles).

The second (STIBP) protects a hyperthread sibling from following branch
predictions which were learned on another sibling. You *might* want
this when running unrelated processes in userspace, for example. Or
different VM guests running on HT siblings.

The third feature (IBRS) is more complicated. It's designed to be
set when you enter a more privileged execution mode (i.e. the kernel).
It prevents branch targets learned in a less-privileged execution mode,
BEFORE IT WAS MOST RECENTLY SET, from taking effect. But it's not just
a 'set-and-forget' feature, it also has barrier-like semantics and
needs to be set on *each* entry into the kernel (from userspace or a VM
guest). It's *also* expensive. And a vile hack, but for a while it was
the only option we had.

Even with IBRS, the CPU cannot tell the difference between different
userspace processes, and between different VM guests. So in addition to
IBRS to protect the kernel, we need the full IBPB barrier on context
switch and vmexit. And maybe STIBP while they're running.

Then along came Paul with the cunning plan of "oh, indirect branches
can be exploited? Screw it, let's not have any of *those* then", which
is retpoline. And it's a *lot* faster than frobbing IBRS on every entry
into the kernel. It's a massive performance win.

So now we *mostly* don't need IBRS. We build with retpoline, use IBPB
on context switches/vmexit (which is in the first part of this patch
series before IBRS is added), and we're safe. We even refactored the
patch series to put retpoline first.

But wait, why did I say "mostly"? Well, not everyone has a retpoline
compiler yet... but OK, screw them; they need to update.

Then there's Skylake, and that generation of CPU cores. For complicated
reasons they actually end up being vulnerable not just on indirect
branches, but also on a 'ret' in some circumstances (such as 16+ CALLs
in a deep chain).

The IBRS solution, ugly though it is, did address that. Retpoline
doesn't. There are patches being floated to detect and prevent deep
stacks, and deal with some of the other special cases that bite on SKL,
but those are icky too. And in fact IBRS performance isn't anywhere
near as bad on this generation of CPUs as it is on earlier CPUs
*anyway*, which makes it not quite so insane to *contemplate* using it
as Intel proposed.

That's why my initial idea, as implemented in this RFC patchset, was to
stick with IBRS on Skylake, and use retpoline everywhere else. I'll
give you "garbage patches", but they weren't being "just mindlessly
sent around". If we're going to drop IBRS support and accept the
caveats, then let's do it as a conscious decision having seen what it
would look like, not just drop it quietly because poor Davey is too
scared that Linus might shout at him again. :)

I have seen *hand-wavy* analyses of the Skylake thing that mean I'm not
actually lying awake at night fretting about it, but nothing concrete
that really says it's OK.

If you view retpoline as a performance optimisation, which is how it
first arrived, then it's rather unconventional to say "well, it only
opens a *little* bit of a security hole but it does go nice and fast so
let's do it".

But fine, I'm content with ditching the use of IBRS to protect the
kernel, and I'm not even surprised. There's a *reason* we put it last
in the series, as both the most contentious and most dispensable part.
I'd be *happier* with a coherent analysis showing Skylake is still OK,
but hey-ho, screw Skylake.

The early part of the series adds the new feature bits and detects when
it can turn KPTI off on non-Meltdown-vulnerable Intel CPUs, and also
supports the IBPB barrier that we need to make retpoline complete. That
much I think we definitely *do* want. There have been a bunch of us
working on this behind the scenes; one of us will probably post that
bit in the next day or so.

I think we also want to expose IBRS to VM guests, even if we don't use
it ourselves. Because Windows guests (and RHEL guests; yay!) do use it.

If we can be done with the shouty part, I'd actually quite like to have
a sensible discussion about when, if ever, we do IBPB on context switch
(ptraceability and dumpable have both been suggested) and when, if
ever, we set STIPB in userspace.
Ingo Molnar Jan. 23, 2018, 7:29 a.m. UTC | #7
* David Woodhouse <dwmw2@infradead.org> wrote:

> But wait, why did I say "mostly"? Well, not everyone has a retpoline
> compiler yet... but OK, screw them; they need to update.
> 
> Then there's Skylake, and that generation of CPU cores. For complicated
> reasons they actually end up being vulnerable not just on indirect
> branches, but also on a 'ret' in some circumstances (such as 16+ CALLs
> in a deep chain).
> 
> The IBRS solution, ugly though it is, did address that. Retpoline
> doesn't. There are patches being floated to detect and prevent deep
> stacks, and deal with some of the other special cases that bite on SKL,
> but those are icky too. And in fact IBRS performance isn't anywhere
> near as bad on this generation of CPUs as it is on earlier CPUs
> *anyway*, which makes it not quite so insane to *contemplate* using it
> as Intel proposed.

There's another possible method to avoid deep stacks on Skylake, without compiler 
support:

  - Use the existing mcount based function tracing live patching machinery
    (CONFIG_FUNCTION_TRACER=y) to install a _very_ fast and simple stack depth 
    tracking tracer which would issue a retpoline when stack depth crosses 
    boundaries of ~16 entries.

The overhead of that would _still_ very likely be much cheaper than a hundreds 
(thousands) of cycle expensive MSR write at every kernel entry (syscall entry, IRQ 
entry, etc.).

Note the huge number of advantages:

 - All distro kernels already enable the mcount based patching options, so there's
   literally zero overhead to anything except SkyLake.

 - It is fully kernel patching based and can be activated on Skylake only

 - It doesn't require any microcode updates, so it will work on all existing CPUs
   with no firmware or microcode modificatons

 - It doesn't require any compiler updates

 - SkyLake performance is very likely to be much less fragile than relying on a 
   hastily deployed microcode hack

 - The "SkyLake stack depth tracer" can be tested on other CPUs as well in debug 
   builds, broadening the testing base

 - The tracer is very obviously simple and reviewable, and we can forget about it
   in the far future.

 - It's much more backportable to older kernels: should there be a new class of
   exploits then this machinery could be updated to cover that too - while 
   upgrades to newer kernels would give the higher performant solution.

Yes, there are some practical complications like always enabling 
CONFIG_FUNCTION_TRACER=y on x86, plus the ftrace interaction has to be sorted out, 
but in practice it's enabled on all major distros anyway, due to ftrace.

Is there any reason why this wouldn't work?

Thanks,

	Ingo
Tom Lendacky Jan. 23, 2018, 4:12 p.m. UTC | #8
On 1/21/2018 1:14 PM, Andy Lutomirski wrote:
> 
> 
>> On Jan 20, 2018, at 11:23 AM, KarimAllah Ahmed <karahmed@amazon.de> wrote:
>>
>> From: Tim Chen <tim.c.chen@linux.intel.com>
>>
>> Create macros to control Indirect Branch Speculation.
>>
>> Name them so they reflect what they are actually doing.
>> The macros are used to restrict and unrestrict the indirect branch speculation.
>> They do not *disable* (or *enable*) indirect branch speculation. A trip back to
>> user-space after *restricting* speculation would still affect the BTB.
>>
>> Quoting from a commit by Tim Chen:
>>
>> """
>>    If IBRS is set, near returns and near indirect jumps/calls will not allow
>>    their predicted target address to be controlled by code that executed in a
>>    less privileged prediction mode *BEFORE* the IBRS mode was last written with
>>    a value of 1 or on another logical processor so long as all Return Stack
>>    Buffer (RSB) entries from the previous less privileged prediction mode are
>>    overwritten.
>>
>>    Thus a near indirect jump/call/return may be affected by code in a less
>>    privileged prediction mode that executed *AFTER* IBRS mode was last written
>>    with a value of 1.
>> """
>>
>> [ tglx: Changed macro names and rewrote changelog ]
>> [ karahmed: changed macro names *again* and rewrote changelog ]
>>
>> Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
>> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
>> Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
>> Cc: Andrea Arcangeli <aarcange@redhat.com>
>> Cc: Andi Kleen <ak@linux.intel.com>
>> Cc: Peter Zijlstra <peterz@infradead.org>
>> Cc: Greg KH <gregkh@linuxfoundation.org>
>> Cc: Dave Hansen <dave.hansen@intel.com>
>> Cc: Andy Lutomirski <luto@kernel.org>
>> Cc: Paolo Bonzini <pbonzini@redhat.com>
>> Cc: Dan Williams <dan.j.williams@intel.com>
>> Cc: Arjan Van De Ven <arjan.van.de.ven@intel.com>
>> Cc: Linus Torvalds <torvalds@linux-foundation.org>
>> Cc: David Woodhouse <dwmw@amazon.co.uk>
>> Cc: Ashok Raj <ashok.raj@intel.com>
>> Link: https://lkml.kernel.org/r/3aab341725ee6a9aafd3141387453b45d788d61a.1515542293.git.tim.c.chen@linux.intel.com
>> Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
>> ---
>> arch/x86/entry/calling.h | 73 ++++++++++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 73 insertions(+)
>>
>> diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
>> index 3f48f69..5aafb51 100644
>> --- a/arch/x86/entry/calling.h
>> +++ b/arch/x86/entry/calling.h
>> @@ -6,6 +6,8 @@
>> #include <asm/percpu.h>
>> #include <asm/asm-offsets.h>
>> #include <asm/processor-flags.h>
>> +#include <asm/msr-index.h>
>> +#include <asm/cpufeatures.h>
>>
>> /*
>>
>> @@ -349,3 +351,74 @@ For 32-bit we have the following conventions - kernel is built with
>> .Lafter_call_\@:
>> #endif
>> .endm
>> +
>> +/*
>> + * IBRS related macros
>> + */
>> +.macro PUSH_MSR_REGS
>> +    pushq    %rax
>> +    pushq    %rcx
>> +    pushq    %rdx
>> +.endm
>> +
>> +.macro POP_MSR_REGS
>> +    popq    %rdx
>> +    popq    %rcx
>> +    popq    %rax
>> +.endm
>> +
>> +.macro WRMSR_ASM msr_nr:req edx_val:req eax_val:req
>> +    movl    \msr_nr, %ecx
>> +    movl    \edx_val, %edx
>> +    movl    \eax_val, %eax
>> +    wrmsr
>> +.endm
>> +
>> +.macro RESTRICT_IB_SPEC
>> +    ALTERNATIVE "jmp .Lskip_\@", "", X86_FEATURE_IBRS
>> +    PUSH_MSR_REGS
>> +    WRMSR_ASM $MSR_IA32_SPEC_CTRL, $0, $SPEC_CTRL_IBRS
>> +    POP_MSR_REGS
>> +.Lskip_\@:
>> +.endm
>> +
>> +.macro UNRESTRICT_IB_SPEC
>> +    ALTERNATIVE "jmp .Lskip_\@", "", X86_FEATURE_IBRS
>> +    PUSH_MSR_REGS
>> +    WRMSR_ASM $MSR_IA32_SPEC_CTRL, $0, $0
> 
> I think you should be writing 2, not 0, since I'm reasonably confident that we want STIBP on.  Can you explain why you're writing 0?

Do we want to talk about STIBP in general?  Should it be (yet another)
boot option to enable or disable?  If there is STIBP support without
IBRS support, it could be a set and forget at boot time.

Thanks,
Tom

> 
> Also, holy cow, there are so many macros here.
> 
> And a meta question: why are there so many submitters of the same series?
>
Woodhouse, David Jan. 23, 2018, 4:20 p.m. UTC | #9
On Tue, 2018-01-23 at 10:12 -0600, Tom Lendacky wrote:
> 
> >> +.macro UNRESTRICT_IB_SPEC
> >> +    ALTERNATIVE "jmp .Lskip_\@", "", X86_FEATURE_IBRS
> >> +    PUSH_MSR_REGS
> >> +    WRMSR_ASM $MSR_IA32_SPEC_CTRL, $0, $0
> > 
> I think you should be writing 2, not 0, since I'm reasonably
> confident that we want STIBP on.  Can you explain why you're writing
> 0?
> 
> Do we want to talk about STIBP in general?  Should it be (yet another)
> boot option to enable or disable?  If there is STIBP support without
> IBRS support, it could be a set and forget at boot time.

We haven't got patches which enable STIBP in general. The kernel itself
is safe either way with retpoline, or because IBRS implies STIBP too
(that is, there's no difference between writing 1 and 3).

So STIBP is purely about protecting userspace processes from one
another, and VM guests from one another, when they run on HT siblings.

There's an argument that there are so many other information leaks
between HT siblings that we might not care. Especially as it's hard to
*tell* when you're scheduling, whether you trust all the processes (or
guests) on your HT siblings right now... let alone later when
scheduling another process if you need to *now* set STIBP on a sibling
which is no longer save from this process now running.

I'm not sure we want to set STIBP *unconditionally* either because of
the performance implications.

For IBRS we had an answer and it was just ugly. For STIBP we don't
actually have an answer for "how do we use this?". Do we?
Pavel Machek Jan. 23, 2018, 8:16 p.m. UTC | #10
On Sun 2018-01-21 20:28:17, David Woodhouse wrote:
> On Sun, 2018-01-21 at 11:34 -0800, Linus Torvalds wrote:
> > All of this is pure garbage.
> > 
> > Is Intel really planning on making this shit architectural? Has
> > anybody talked to them and told them they are f*cking insane?
> > 
> > Please, any Intel engineers here - talk to your managers. 
> 
> If the alternative was a two-decade product recall and giving everyone
> free CPUs, I'm not sure it was entirely insane.
> 
> Certainly it's a nasty hack, but hey — the world was on fire and in the
> end we didn't have to just turn the datacentres off and go back to goat
> farming, so it's not all bad.

Well, someone at Intel put world on fire. And then was selling faulty
CPUs for half a year while world was on fire; they knew they are
faulty yet they sold them anyway.

Then Intel talks about how great they are and how security is
important for them.... Intentionaly confusing between Meltdown and
Spectre so they can mask how badly they screwed. And without apologies.

> As a hack for existing CPUs, it's just about tolerable — as long as it
> can die entirely by the next generation.
> 
> So the part is I think is odd is the IBRS_ALL feature, where a future
> CPU will advertise "I am able to be not broken" and then you have to
> set the IBRS bit once at boot time to *ask* it not to be broken. That
> part is weird, because it ought to have been treated like the RDCL_NO
> bit — just "you don't have to worry any more, it got better".

And now Intel wants to cheat at benchmarks, to put companies that do
right thing at disadvantage and thinks that that's okay because world
was on fire?

At this point, I believe that yes, product recall would be
appropriate. If Intel is not willing to do it on their own, well,
perhaps courts can force them. Ouch and I wound not mind some jail time
for whoever is responsible for selling known-faulty CPUs to the public.

Oh, and still no word about the real fixes. World is not only Linux,
you see? https://pavelmachek.livejournal.com/140949.html?nojs=1

Best regards,
									Pavel
Tom Lendacky Jan. 23, 2018, 10:37 p.m. UTC | #11
On 1/23/2018 10:20 AM, Woodhouse, David wrote:
> On Tue, 2018-01-23 at 10:12 -0600, Tom Lendacky wrote:
>>
>>>> +.macro UNRESTRICT_IB_SPEC
>>>> +    ALTERNATIVE "jmp .Lskip_\@", "", X86_FEATURE_IBRS
>>>> +    PUSH_MSR_REGS
>>>> +    WRMSR_ASM $MSR_IA32_SPEC_CTRL, $0, $0
>>>  
>> I think you should be writing 2, not 0, since I'm reasonably
>> confident that we want STIBP on.  Can you explain why you're writing
>> 0?
>>
>> Do we want to talk about STIBP in general?  Should it be (yet another)
>> boot option to enable or disable?  If there is STIBP support without
>> IBRS support, it could be a set and forget at boot time.
> 
> We haven't got patches which enable STIBP in general. The kernel itself
> is safe either way with retpoline, or because IBRS implies STIBP too
> (that is, there's no difference between writing 1 and 3).
> 
> So STIBP is purely about protecting userspace processes from one
> another, and VM guests from one another, when they run on HT siblings.
> 
> There's an argument that there are so many other information leaks
> between HT siblings that we might not care. Especially as it's hard to
> *tell* when you're scheduling, whether you trust all the processes (or
> guests) on your HT siblings right now... let alone later when
> scheduling another process if you need to *now* set STIBP on a sibling
> which is no longer save from this process now running.
> 
> I'm not sure we want to set STIBP *unconditionally* either because of
> the performance implications.
> 
> For IBRS we had an answer and it was just ugly. For STIBP we don't
> actually have an answer for "how do we use this?". Do we?

Not sure.  Maybe to start, the answer might be to allow it to be set for
the ultra-paranoid, but in general don't enable it by default.  Having it
enabled would be an alternative to someone deciding to disable SMT, since
that would have even more of a performance impact.

Thanks,
Tom

> 
>
Andi Kleen Jan. 23, 2018, 10:49 p.m. UTC | #12
> Not sure.  Maybe to start, the answer might be to allow it to be set for
> the ultra-paranoid, but in general don't enable it by default.  Having it
> enabled would be an alternative to someone deciding to disable SMT, since
> that would have even more of a performance impact.

I agree. A reasonable strategy would be to only enable it for
processes that have dumpable disabled. This should be already set for
high value processes like GPG, and allows others to opt-in if
they need to.

-Andi
Woodhouse, David Jan. 23, 2018, 11:14 p.m. UTC | #13
On Tue, 2018-01-23 at 14:49 -0800, Andi Kleen wrote:
> > Not sure.  Maybe to start, the answer might be to allow it to be set for
> > the ultra-paranoid, but in general don't enable it by default.  Having it
> > enabled would be an alternative to someone deciding to disable SMT, since
> > that would have even more of a performance impact.
> 
> I agree. A reasonable strategy would be to only enable it for
> processes that have dumpable disabled. This should be already set for
> high value processes like GPG, and allows others to opt-in if
> they need to.

That seems to make sense, and I think was the solution we were
approaching for IBPB on context switch too, right?

Are we generally agreed on dumpable as the criterion for both of those?
Andi Kleen Jan. 23, 2018, 11:22 p.m. UTC | #14
On Tue, Jan 23, 2018 at 11:14:36PM +0000, Woodhouse, David wrote:
> On Tue, 2018-01-23 at 14:49 -0800, Andi Kleen wrote:
> > > Not sure.  Maybe to start, the answer might be to allow it to be set for
> > > the ultra-paranoid, but in general don't enable it by default.  Having it
> > > enabled would be an alternative to someone deciding to disable SMT, since
> > > that would have even more of a performance impact.
> > 
> > I agree. A reasonable strategy would be to only enable it for
> > processes that have dumpable disabled. This should be already set for
> > high value processes like GPG, and allows others to opt-in if
> > they need to.
> 
> That seems to make sense, and I think was the solution we were
> approaching for IBPB on context switch too, right?

Right.

-Andi
Tim Chen Jan. 24, 2018, 12:47 a.m. UTC | #15
On 01/23/2018 03:14 PM, Woodhouse, David wrote:
> On Tue, 2018-01-23 at 14:49 -0800, Andi Kleen wrote:
>>> Not sure.  Maybe to start, the answer might be to allow it to be set for
>>> the ultra-paranoid, but in general don't enable it by default.  Having it
>>> enabled would be an alternative to someone deciding to disable SMT, since
>>> that would have even more of a performance impact.
>>
>> I agree. A reasonable strategy would be to only enable it for
>> processes that have dumpable disabled. This should be already set for
>> high value processes like GPG, and allows others to opt-in if
>> they need to.
> 
> That seems to make sense, and I think was the solution we were
> approaching for IBPB on context switch too, right?
> 
> Are we generally agreed on dumpable as the criterion for both of those?
> 

It is a reasonable approach.  Let a process who needs max security
opt in with disabled dumpable. It can have a flush with IBPB clear before
starting to run, and have STIBP set while running.

Tim
Andy Lutomirski Jan. 24, 2018, 1 a.m. UTC | #16
On Tue, Jan 23, 2018 at 4:47 PM, Tim Chen <tim.c.chen@linux.intel.com> wrote:
> On 01/23/2018 03:14 PM, Woodhouse, David wrote:
>> On Tue, 2018-01-23 at 14:49 -0800, Andi Kleen wrote:
>>>> Not sure.  Maybe to start, the answer might be to allow it to be set for
>>>> the ultra-paranoid, but in general don't enable it by default.  Having it
>>>> enabled would be an alternative to someone deciding to disable SMT, since
>>>> that would have even more of a performance impact.
>>>
>>> I agree. A reasonable strategy would be to only enable it for
>>> processes that have dumpable disabled. This should be already set for
>>> high value processes like GPG, and allows others to opt-in if
>>> they need to.
>>
>> That seems to make sense, and I think was the solution we were
>> approaching for IBPB on context switch too, right?
>>
>> Are we generally agreed on dumpable as the criterion for both of those?
>>
>
> It is a reasonable approach.  Let a process who needs max security
> opt in with disabled dumpable. It can have a flush with IBPB clear before
> starting to run, and have STIBP set while running.
>

Do we maybe want a separate opt in?  I can easily imagine things like
web browsers that *don't* want to be non-dumpable but do want this
opt-in.

Also, what's the performance hit of STIBP?
David Woodhouse Jan. 24, 2018, 1:22 a.m. UTC | #17
On Tue, 2018-01-23 at 17:00 -0800, Andy Lutomirski wrote:
> On Tue, Jan 23, 2018 at 4:47 PM, Tim Chen <tim.c.chen@linux.intel.com> wrote:
> > 
> > On 01/23/2018 03:14 PM, Woodhouse, David wrote:
> > > 
> > > On Tue, 2018-01-23 at 14:49 -0800, Andi Kleen wrote:
> > > > 
> > > > > 
> > > > > Not sure.  Maybe to start, the answer might be to allow it to be set for
> > > > > the ultra-paranoid, but in general don't enable it by default.  Having it
> > > > > enabled would be an alternative to someone deciding to disable SMT, since
> > > > > that would have even more of a performance impact.
> > > > I agree. A reasonable strategy would be to only enable it for
> > > > processes that have dumpable disabled. This should be already set for
> > > > high value processes like GPG, and allows others to opt-in if
> > > > they need to.
> > > That seems to make sense, and I think was the solution we were
> > > approaching for IBPB on context switch too, right?
> > > 
> > > Are we generally agreed on dumpable as the criterion for both of those?
> > > 
> > It is a reasonable approach.  Let a process who needs max security
> > opt in with disabled dumpable. It can have a flush with IBPB clear before
> > starting to run, and have STIBP set while running.
> > 
> Do we maybe want a separate opt in?  I can easily imagine things like
> web browsers that *don't* want to be non-dumpable but do want this
> opt-in.
 
This is to protect you from another local process running on a HT
sibling. Not the kind of thing that web browsers are normally worrying
about.

> Also, what's the performance hit of STIBP?

Varies per CPU generation, but generally approaching that of full IBRS
I think? I don't recall looking at this specifically (since we haven't
actually used it for this yet).
Van De Ven, Arjan Jan. 24, 2018, 1:59 a.m. UTC | #18
> > It is a reasonable approach.  Let a process who needs max security

> > opt in with disabled dumpable. It can have a flush with IBPB clear before

> > starting to run, and have STIBP set while running.

> >

> 

> Do we maybe want a separate opt in?  I can easily imagine things like

> web browsers that *don't* want to be non-dumpable but do want this

> opt-in.


eventually we need something better. Probably in addition.
dumpable is used today for things that want this.

> 

> Also, what's the performance hit of STIBP?


pretty steep, but it depends on the CPU generation, for some it's cheaper than others. (yes I realize this is a vague answer, but the range is really from just about zero to oh my god)

I'm not a fan of doing this right now to be honest. We really need to not piece meal some of this, and come up with a better concept of protection on a higher level.
For example, you mention web browsers, but the threat model for browsers is generally internet content. For V2 to work you need to get some "evil pointer" into the app from the observer and browsers usually aren't doing that.
The most likely user would be some software-TPM-like service that has magic keys.

And for keys we want something else... we want an madvice() sort of thing that does a few things, like equivalent of mlock (so the key does not end up in swap), not having the page (but potentially the rest) end up in core dumps, and the kernel making sure that if the program exits (say for segv) that the key page gets zeroed before going into the free pool. Once you do that as feature, making the key speculation safe is not too hard (intel and arm have cpu options to mark pages for that)
Andy Lutomirski Jan. 24, 2018, 3:25 a.m. UTC | #19
> On Jan 23, 2018, at 5:59 PM, Van De Ven, Arjan <arjan.van.de.ven@intel.com> wrote:
> 
> 
>>> It is a reasonable approach.  Let a process who needs max security
>>> opt in with disabled dumpable. It can have a flush with IBPB clear before
>>> starting to run, and have STIBP set while running.
>>> 
>> 
>> Do we maybe want a separate opt in?  I can easily imagine things like
>> web browsers that *don't* want to be non-dumpable but do want this
>> opt-in.
> 
> eventually we need something better. Probably in addition.
> dumpable is used today for things that want this.
> 
>> 
>> Also, what's the performance hit of STIBP?
> 
> pretty steep, but it depends on the CPU generation, for some it's cheaper than others. (yes I realize this is a vague answer, but the range is really from just about zero to oh my god)
> 
> I'm not a fan of doing this right now to be honest. We really need to not piece meal some of this, and come up with a better concept of protection on a higher level.
> For example, you mention web browsers, but the threat model for browsers is generally internet content. For V2 to work you need to get some "evil pointer" into the app from the observer and browsers usually aren't doing that.
> The most likely user would be some software-TPM-like service that has magic keys.
> 
> And for keys we want something else... we want an madvice() sort of thing that does a few things, like equivalent of mlock (so the key does not end up in swap),

I'd love to see a slight variant: encrypt that page against some ephemeral key if it gets swapped.

> not having the page (but potentially the rest) end up in core dumps, and the kernel making sure that if the program exits (say for segv) that the key page gets zeroed before going into the free pool. Once you do that as feature, making the key speculation safe is not too hard (intel and arm have cpu options to mark pages for that)
> 
> 

How do we do that on Intel?  Make it UC?
diff mbox

Patch

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 3f48f69..5aafb51 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -6,6 +6,8 @@ 
 #include <asm/percpu.h>
 #include <asm/asm-offsets.h>
 #include <asm/processor-flags.h>
+#include <asm/msr-index.h>
+#include <asm/cpufeatures.h>
 
 /*
 
@@ -349,3 +351,74 @@  For 32-bit we have the following conventions - kernel is built with
 .Lafter_call_\@:
 #endif
 .endm
+
+/*
+ * IBRS related macros
+ */
+.macro PUSH_MSR_REGS
+	pushq	%rax
+	pushq	%rcx
+	pushq	%rdx
+.endm
+
+.macro POP_MSR_REGS
+	popq	%rdx
+	popq	%rcx
+	popq	%rax
+.endm
+
+.macro WRMSR_ASM msr_nr:req edx_val:req eax_val:req
+	movl	\msr_nr, %ecx
+	movl	\edx_val, %edx
+	movl	\eax_val, %eax
+	wrmsr
+.endm
+
+.macro RESTRICT_IB_SPEC
+	ALTERNATIVE "jmp .Lskip_\@", "", X86_FEATURE_IBRS
+	PUSH_MSR_REGS
+	WRMSR_ASM $MSR_IA32_SPEC_CTRL, $0, $SPEC_CTRL_IBRS
+	POP_MSR_REGS
+.Lskip_\@:
+.endm
+
+.macro UNRESTRICT_IB_SPEC
+	ALTERNATIVE "jmp .Lskip_\@", "", X86_FEATURE_IBRS
+	PUSH_MSR_REGS
+	WRMSR_ASM $MSR_IA32_SPEC_CTRL, $0, $0
+	POP_MSR_REGS
+.Lskip_\@:
+.endm
+
+.macro RESTRICT_IB_SPEC_CLOBBER
+	ALTERNATIVE "jmp .Lskip_\@", "", X86_FEATURE_IBRS
+	WRMSR_ASM $MSR_IA32_SPEC_CTRL, $0, $SPEC_CTRL_IBRS
+.Lskip_\@:
+.endm
+
+.macro UNRESTRICT_IB_SPEC_CLOBBER
+	ALTERNATIVE "jmp .Lskip_\@", "", X86_FEATURE_IBRS
+	WRMSR_ASM $MSR_IA32_SPEC_CTRL, $0, $0
+.Lskip_\@:
+.endm
+
+.macro RESTRICT_IB_SPEC_SAVE_AND_CLOBBER save_reg:req
+	ALTERNATIVE "jmp .Lskip_\@", "", X86_FEATURE_IBRS
+	movl	$MSR_IA32_SPEC_CTRL, %ecx
+	rdmsr
+	movl	%eax, \save_reg
+	movl	$0, %edx
+	movl	$SPEC_CTRL_IBRS, %eax
+	wrmsr
+.Lskip_\@:
+.endm
+
+.macro RESTORE_IB_SPEC_CLOBBER save_reg:req
+	ALTERNATIVE "jmp .Lskip_\@", "", X86_FEATURE_IBRS
+	/* Set IBRS to the value saved in the save_reg */
+	movl    $MSR_IA32_SPEC_CTRL, %ecx
+	movl    $0, %edx
+	movl    \save_reg, %eax
+	wrmsr
+.Lskip_\@:
+.endm