[v3,8/9] x86: use __uaccess_begin_nospec and ASM_IFENCE in get_user paths

Message ID 151586748981.5820.14559543798744763404.stgit@dwillia2-desk3.amr.corp.intel.com (mailing list archive)
State New, archived

Commit Message

Dan Williams Jan. 13, 2018, 6:18 p.m. UTC
Quoting Linus:

    I do think that it would be a good idea to very expressly document
    the fact that it's not that the user access itself is unsafe. I do
    agree that things like "get_user()" want to be protected, but not
    because of any direct bugs or problems with get_user() and friends,
    but simply because get_user() is an excellent source of a pointer
    that is obviously controlled from a potentially attacking user
    space. So it's a prime candidate for then finding _subsequent_
    accesses that can then be used to perturb the cache.

Note that '__copy_user_ll' is also called in the 'put_user' case, but
there is currently no indication that put_user in general deserves the
same hygiene as 'get_user'.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Suggested-by: Andi Kleen <ak@linux.intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Kees Cook <keescook@chromium.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: x86@kernel.org
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/x86/include/asm/uaccess.h    |    6 +++---
 arch/x86/include/asm/uaccess_32.h |    6 +++---
 arch/x86/include/asm/uaccess_64.h |   12 ++++++------
 arch/x86/lib/copy_user_64.S       |    3 +++
 arch/x86/lib/getuser.S            |    5 +++++
 arch/x86/lib/usercopy_32.c        |    8 ++++----
 6 files changed, 24 insertions(+), 16 deletions(-)

Comments

Linus Torvalds Jan. 13, 2018, 7:05 p.m. UTC | #1
On Sat, Jan 13, 2018 at 10:18 AM, Dan Williams <dan.j.williams@intel.com> wrote:
> diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
> index c97d935a29e8..85f400b8ee7c 100644
> --- a/arch/x86/lib/getuser.S
> +++ b/arch/x86/lib/getuser.S
> @@ -41,6 +41,7 @@ ENTRY(__get_user_1)
>         cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
>         jae bad_get_user
>         ASM_STAC
> +       ASM_IFENCE
>  1:     movzbl (%_ASM_AX),%edx
>         xor %eax,%eax
>         ASM_CLAC

So I really would like to know from somebody (preferably somebody with
real microarchitectural knowledge) just how expensive that "lfence"
ends up being.

Because since we could just generate the masking of the address from
the exact same condition code that we already generate, the "lfence"
really can be replaced by just two ALU instructions instead:

   diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
   index c97d935a29e8..4c378b485399 100644
   --- a/arch/x86/lib/getuser.S
   +++ b/arch/x86/lib/getuser.S
   @@ -40,6 +40,8 @@ ENTRY(__get_user_1)
           mov PER_CPU_VAR(current_task), %_ASM_DX
           cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
           jae bad_get_user
   +       sbb %_ASM_DX,%_ASM_DX
   +       and %_ASM_DX,%_ASM_AX
           ASM_STAC
    1:     movzbl (%_ASM_AX),%edx
           xor %eax,%eax

which looks like it should have a fairly low maximum overhead (ok, the
above is totally untested, maybe I got the condition the wrong way
around _again_).

I _know_ that lfence is expensive as hell on P4, for example.

Yes, yes, "sbb" is often more expensive than most ALU instructions,
and Agner Fog says it has a 10-cycle latency on Prescott (which is
outrageous, but being one or two cycles more due to the flags
generation is normal). So the sbb/and may certainly add a few cycles
to the critical path, but on Prescott "lfence" is *50* cycles
according to those same tables by Agner Fog.

Is there anybody who is willing to say one way or another wrt the
"sbb/and" sequence vs "lfence".

                       Linus
Linus Torvalds Jan. 13, 2018, 7:33 p.m. UTC | #2
On Sat, Jan 13, 2018 at 11:05 AM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> I _know_ that lfence is expensive as hell on P4, for example.
>
> Yes, yes, "sbb" is often more expensive than most ALU instructions,
> and Agner Fog says it has a 10-cycle latency on Prescott (which is
> outrageous, but being one or two cycles more due to the flags
> generation is normal). So the sbb/and may certainly add a few cycles
> to the critical path, but on Prescott "lfence" is *50* cycles
> according to those same tables by Agner Fog.

Side note: I don't think P4 is really relevant for a performance
discussion, I was just giving it as an example where we do know actual
cycles.

I'm much more interested in modern Intel big-core CPU's, and just
wondering whether somebody could ask an architect.

Because I _suspect_ the answer from a CPU architect would be: "Christ,
the sbb/and sequence is much better because it doesn't have any extra
serialization", but maybe I'm wrong, and people feel that lfence is
particularly easy to do right without any real downside.

                Linus
Eric W. Biederman Jan. 13, 2018, 8:22 p.m. UTC | #3
Linus Torvalds <torvalds@linux-foundation.org> writes:

> On Sat, Jan 13, 2018 at 11:05 AM, Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
>>
>> I _know_ that lfence is expensive as hell on P4, for example.
>>
>> Yes, yes, "sbb" is often more expensive than most ALU instructions,
>> and Agner Fog says it has a 10-cycle latency on Prescott (which is
>> outrageous, but being one or two cycles more due to the flags
>> generation is normal). So the sbb/and may certainly add a few cycles
>> to the critical path, but on Prescott "lfence" is *50* cycles
>> according to those same tables by Agner Fog.
>
> Side note: I don't think P4 is really relevant for a performance
> discussion, I was just giving it as an example where we do know actual
> cycles.
>
> I'm much more interested in modern Intel big-core CPU's, and just
> wondering whether somebody could ask an architect.
>
> Because I _suspect_ the answer from a CPU architect would be: "Christ,
> the sbb/and sequence is much better because it doesn't have any extra
> serialization", but maybe I'm wrong, and people feel that lfence is
> particularly easy to do right without any real downside.

As an educated observer it seems like the cmpq/sbb/and sequence is an
improvement because it moves the dependency from one end of the cpu
pipeline to another.  If any cpu does data speculation on anything other
than branch targets, that sequence could still be susceptible to
speculation.

From the AMD patches it appears that lfence is becoming a serializing
instruction, which in principle is much more expensive.

Also do we have alternatives for these sequences so if we run on an
in-order atom (or 386 or 486) where speculation does not occur we can
avoid the cost?

Eric
Dan Williams Jan. 16, 2018, 10:23 p.m. UTC | #4
On Sat, Jan 13, 2018 at 11:33 AM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> On Sat, Jan 13, 2018 at 11:05 AM, Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
>>
>> I _know_ that lfence is expensive as hell on P4, for example.
>>
>> Yes, yes, "sbb" is often more expensive than most ALU instructions,
>> and Agner Fog says it has a 10-cycle latency on Prescott (which is
>> outrageous, but being one or two cycles more due to the flags
>> generation is normal). So the sbb/and may certainly add a few cycles
>> to the critical path, but on Prescott "lfence" is *50* cycles
>> according to those same tables by Agner Fog.
>
> Side note: I don't think P4 is really relevant for a performance
> discussion, I was just giving it as an example where we do know actual
> cycles.
>
> I'm much more interested in modern Intel big-core CPU's, and just
> wondering whether somebody could ask an architect.
>
> Because I _suspect_ the answer from a CPU architect would be: "Christ,
> the sbb/and sequence is much better because it doesn't have any extra
> serialization", but maybe I'm wrong, and people feel that lfence is
> particularly easy to do right without any real downside.
>

From the last paragraph of this guidance:

https://software.intel.com/sites/default/files/managed/c5/63/336996-Speculative-Execution-Side-Channel-Mitigations.pdf

...I read that as Linux can constrain speculation with 'sbb; and'
instead of 'lfence', and any future changes will be handled like any
new cpu enabling.

To your specific question of the relative costs, sbb is
architecturally cheaper, so let's go with that approach.

For this '__uaccess_begin_nospec' patch set, at a minimum the kernel
needs a helper that can be easily grep'd when/if it needs changing in
a future kernel. It also indicates that the command line approach to
dynamically switch the mitigation mechanism is over-engineering.

That said, for get_user specifically, can we do something even
cheaper. Dave H. reminds me that any valid user pointer that gets past
the address limit check will have the high bit clear. So instead of
calculating a mask, just unconditionally clear the high bit. It seems
worst case userspace can speculatively leak something that's already
in its address space.

I'll respin this set along those lines, and drop the ifence bits.
Linus Torvalds Jan. 16, 2018, 10:41 p.m. UTC | #5
On Jan 16, 2018 14:23, "Dan Williams" <dan.j.williams@intel.com> wrote:

> That said, for get_user specifically, can we do something even
> cheaper. Dave H. reminds me that any valid user pointer that gets past
> the address limit check will have the high bit clear. So instead of
> calculating a mask, just unconditionally clear the high bit. It seems
> worst case userspace can speculatively leak something that's already
> in its address space.

That's not at all true.

The address may be a kernel address. That's the whole point of 'set_fs()'.

That's why we compare against the address limit variable, not against some
constant number.

     Linus
Dan Williams Jan. 17, 2018, 4:30 a.m. UTC | #6
On Tue, Jan 16, 2018 at 2:23 PM, Dan Williams <dan.j.williams@intel.com> wrote:
> On Sat, Jan 13, 2018 at 11:33 AM, Linus Torvalds
[..]
> I'll respin this set along those lines, and drop the ifence bits.

So now I'm not so sure. Yes, get_user_{1,2,4,8} can mask the pointer
with the address limit result, but this doesn't work for the
access_ok() + __get_user() case. We can either change the access_ok()
calling convention to return a properly masked pointer to be used in
subsequent calls to __get_user(), or go with lfence on every
__get_user call. There seem to be several drivers that open code
copy_from_user() with __get_user loops, so the 'fence every
__get_user' approach might have noticeable overhead. On the other hand
the access_ok conversion, while it could be scripted with coccinelle,
is ~300 sites (VERIFY_READ), if you're concerned about having
something small to merge for 4.15.

I think the access_ok() conversion to return a speculation sanitized
pointer or NULL is the way to go unless I'm missing something simpler.
Other ideas?
Al Viro Jan. 17, 2018, 6:28 a.m. UTC | #7
On Tue, Jan 16, 2018 at 08:30:17PM -0800, Dan Williams wrote:
> On Tue, Jan 16, 2018 at 2:23 PM, Dan Williams <dan.j.williams@intel.com> wrote:
> > On Sat, Jan 13, 2018 at 11:33 AM, Linus Torvalds
> [..]
> > I'll respin this set along those lines, and drop the ifence bits.
> 
> So now I'm not so sure. Yes, get_user_{1,2,4,8} can mask the pointer
> with the address limit result, but this doesn't work for the
> access_ok() + __get_user() case. We can either change the access_ok()
> calling convention to return a properly masked pointer to be used in
> subsequent calls to __get_user(), or go with lfence on every
> __get_user call. There seem to be several drivers that open code
> copy_from_user() with __get_user loops, so the 'fence every
> __get_user' approach might have noticeable overhead. On the other hand
> the access_ok conversion, while it could be scripted with coccinelle,
> is ~300 sites (VERIFY_READ), if you're concerned about having
> something small to merge for 4.15.
> 
> I think the access_ok() conversion to return a speculation sanitized
> pointer or NULL is the way to go unless I'm missing something simpler.
> Other ideas?

What masked pointer?  access_ok() exists for other architectures as well,
and the fewer callers remain outside of arch/*, the better.

Anything that open-codes copy_from_user() that way is *ALREADY* fucked if
it cares about the overhead - recent x86 boxen will have slowdown from
hell on stac()/clac() pairs.  Anything like that on a hot path is already
deep in trouble and needs to be found and fixed.  What drivers would those
be?  We don't have that many __get_user() users left outside of arch/*
anymore...
Dan Williams Jan. 17, 2018, 6:50 a.m. UTC | #8
On Tue, Jan 16, 2018 at 10:28 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Tue, Jan 16, 2018 at 08:30:17PM -0800, Dan Williams wrote:
>> On Tue, Jan 16, 2018 at 2:23 PM, Dan Williams <dan.j.williams@intel.com> wrote:
>> > On Sat, Jan 13, 2018 at 11:33 AM, Linus Torvalds
>> [..]
>> > I'll respin this set along those lines, and drop the ifence bits.
>>
>> So now I'm not so sure. Yes, get_user_{1,2,4,8} can mask the pointer
>> with the address limit result, but this doesn't work for the
>> access_ok() + __get_user() case. We can either change the access_ok()
>> calling convention to return a properly masked pointer to be used in
>> subsequent calls to __get_user(), or go with lfence on every
>> __get_user call. There seem to be several drivers that open code
>> copy_from_user() with __get_user loops, so the 'fence every
>> __get_user' approach might have noticeable overhead. On the other hand
>> the access_ok conversion, while it could be scripted with coccinelle,
>> is ~300 sites (VERIFY_READ), if you're concerned about having
>> something small to merge for 4.15.
>>
>> I think the access_ok() conversion to return a speculation sanitized
>> pointer or NULL is the way to go unless I'm missing something simpler.
>> Other ideas?
>
> What masked pointer?

The pointer value that is masked under speculation.

   diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
   index c97d935a29e8..4c378b485399 100644
   --- a/arch/x86/lib/getuser.S
   +++ b/arch/x86/lib/getuser.S
   @@ -40,6 +40,8 @@ ENTRY(__get_user_1)
           mov PER_CPU_VAR(current_task), %_ASM_DX
           cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
           jae bad_get_user
   +       sbb %_ASM_DX,%_ASM_DX
   +       and %_ASM_DX,%_ASM_AX
           ASM_STAC
    1:     movzbl (%_ASM_AX),%edx
           xor %eax,%eax

...i.e. %_ASM_AX is guaranteed to be zero if userspace tries to cause
speculation with an address above the limit. The proposal is to make
access_ok() do that same masking so we never speculate on pointers from
userspace aimed at kernel memory.

> access_ok() exists for other architectures as well,

I'd modify those as well...

> and the fewer callers remain outside of arch/*, the better.
>
> Anything that open-codes copy_from_user() that way is *ALREADY* fucked if
> it cares about the overhead - recent x86 boxen will have slowdown from
> hell on stac()/clac() pairs.  Anything like that on a hot path is already
> deep in trouble and needs to be found and fixed.  What drivers would those
> be?

So I took a closer look and the pattern is not copy_from_user it's
more like __get_user + write-to-hardware loops. If the performance is
already expected to be bad for those then perhaps an lfence each loop
iteration won't be much worse. It's still a waste because the lfence
is only needed once after the access_ok.

> We don't have that many __get_user() users left outside of arch/*
> anymore...
David Laight Jan. 17, 2018, 10:07 a.m. UTC | #9
From: Dan Williams
> Sent: 17 January 2018 06:50
...
> > Anything that open-codes copy_from_user() that way is *ALREADY* fucked if
> > it cares about the overhead - recent x86 boxen will have slowdown from
> > hell on stac()/clac() pairs.  Anything like that on a hot path is already
> > deep in trouble and needs to be found and fixed.  What drivers would those
> > be?
>
> So I took a closer look and the pattern is not copy_from_user it's
> more like __get_user + write-to-hardware loops. If the performance is
> already expected to be bad for those then perhaps an lfence each loop
> iteration won't be much worse. It's still a waste because the lfence
> is only needed once after the access_ok.

Performance of PCIe writes isn't that bad (since they are posted), so
adding a synchronising instruction to __get_user() could easily
be noticeable.

Unfortunately you can't use copy_from_user() with a PCIe mapped
target memory address (or even memcpy_toio()) because on x86
the copy is aliased to memcpy() and that uses 'rep movsb'
which has to do single byte transfers because the address is
uncached.
(This is really bad for PCIe reads.)

	David
Alan Cox Jan. 17, 2018, 2:17 p.m. UTC | #10
On Tue, 2018-01-16 at 14:41 -0800, Linus Torvalds wrote:
> 
> 
> On Jan 16, 2018 14:23, "Dan Williams" <dan.j.williams@intel.com>
> wrote:
> > That said, for get_user specifically, can we do something even
> > cheaper. Dave H. reminds me that any valid user pointer that gets
> > past
> > the address limit check will have the high bit clear. So instead of
> > calculating a mask, just unconditionally clear the high bit. It
> > seems
> > worst case userspace can speculatively leak something that's
> > already
> > in its address space.
> 
> That's not at all true.
> 
> The address may be a kernel address. That's the whole point of
> 'set_fs()'.

Can we kill off the remaining users of set_fs() ?

Alan
Dan Williams Jan. 17, 2018, 6:12 p.m. UTC | #11
On Tue, Jan 16, 2018 at 10:50 PM, Dan Williams <dan.j.williams@intel.com> wrote:
> On Tue, Jan 16, 2018 at 10:28 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
[..]
>> Anything that open-codes copy_from_user() that way is *ALREADY* fucked if
>> it cares about the overhead - recent x86 boxen will have slowdown from
>> hell on stac()/clac() pairs.  Anything like that on a hot path is already
>> deep in trouble and needs to be found and fixed.  What drivers would those
>> be?
>
> So I took a closer look and the pattern is not copy_from_user it's
> more like __get_user + write-to-hardware loops. If the performance is
> already expected to be bad for those then perhaps an lfence each loop
> iteration won't be much worse. It's still a waste because the lfence
> is only needed once after the access_ok.
>
>> We don't have that many __get_user() users left outside of arch/*
>> anymore...

Given the concern of having something easy to backport first I think
we should start with lfence in __uaccess_begin(). Any deeper changes
to the access_ok() + __get_user calling convention can build on top of
that baseline.
Al Viro Jan. 17, 2018, 6:52 p.m. UTC | #12
On Wed, Jan 17, 2018 at 02:17:26PM +0000, Alan Cox wrote:
> On Tue, 2018-01-16 at 14:41 -0800, Linus Torvalds wrote:
> > 
> > 
> > On Jan 16, 2018 14:23, "Dan Williams" <dan.j.williams@intel.com>
> > wrote:
> > > That said, for get_user specifically, can we do something even
> > > cheaper. Dave H. reminds me that any valid user pointer that gets
> > > past
> > > the address limit check will have the high bit clear. So instead of
> > > calculating a mask, just unconditionally clear the high bit. It
> > > seems
> > > worst case userspace can speculatively leak something that's
> > > already
> > > in its address space.
> > 
> > That's not at all true.
> > 
> > The address may be a kernel address. That's the whole point of
> > 'set_fs()'.
> 
> Can we kill off the remaining users of set_fs() ?

Not easily.  They tend to come in pairs (the usual pattern is get_fs(),
save the result, set_fs(something), do work, set_fs(saved)), and
counting each such area as single instance we have (in my tree right
now) 121 locations.  Some could be killed (and will eventually be -
the number of set_fs()/access_ok()/__{get,put}_user()/__copy_...()
call sites had been seriously decreasing during the last couple of
years), but some are really hard to kill off.

How, for example, would you deal with this one:

/*
 * Receive a datagram from a UDP socket.
 */
static int svc_udp_recvfrom(struct svc_rqst *rqstp)
{
        struct svc_sock *svsk =
                container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
        struct svc_serv *serv = svsk->sk_xprt.xpt_server;
        struct sk_buff  *skb;
        union {
                struct cmsghdr  hdr;
                long            all[SVC_PKTINFO_SPACE / sizeof(long)];
        } buffer;
        struct cmsghdr *cmh = &buffer.hdr;
        struct msghdr msg = {
                .msg_name = svc_addr(rqstp),
                .msg_control = cmh,
                .msg_controllen = sizeof(buffer),
                .msg_flags = MSG_DONTWAIT,
        };
...
        err = kernel_recvmsg(svsk->sk_sock, &msg, NULL,
                             0, 0, MSG_PEEK | MSG_DONTWAIT);

With kernel_recvmsg() (and in my tree the above is its last surviving caller)
being

int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
                   struct kvec *vec, size_t num, size_t size, int flags)
{
        mm_segment_t oldfs = get_fs();
        int result;

        iov_iter_kvec(&msg->msg_iter, READ | ITER_KVEC, vec, num, size);
        set_fs(KERNEL_DS);
        result = sock_recvmsg(sock, msg, flags);
        set_fs(oldfs);
        return result;
}
EXPORT_SYMBOL(kernel_recvmsg);

We are asking for recvmsg() with zero data length; what we really want is
->msg_control.  And _that_ is why we need that set_fs() - we want the damn
thing to go into a local variable.

But note that filling ->msg_control will happen in put_cmsg(), called
from ip_cmsg_recv_pktinfo(), called from ip_cmsg_recv_offset(),
called from udp_recvmsg(), called from sock_recvmsg_nosec(), called
from sock_recvmsg().  Or in another path in case of IPv6.
Sure, we can arrange for propagation of that all way down those
call chains.  My preference would be to try and mark that (one and
only) case in ->msg_flags, so that put_cmsg() would be able to
check.  ___sys_recvmsg() sets that as
        msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
so we ought to be free to use any bit other than those two.  Since
put_cmsg() already checks ->msg_flags, that shouldn't put too much
overhead.  But then we'll need to do something to prevent speculative
execution straying down that way, won't we?  I'm not saying it can't
be done, but quite a few of the remaining call sites will take
serious work.

	Incidentally, what about copy_to_iter() and friends?  They
check iov_iter flavour and go either into the "copy to kernel buffer"
or "copy to userland" paths.  Do we need to deal with mispredictions
there?  We are calling a bunch of those on read()...
Linus Torvalds Jan. 17, 2018, 7:16 p.m. UTC | #13
On Tue, Jan 16, 2018 at 8:30 PM, Dan Williams <dan.j.williams@intel.com> wrote:
>
> I think the access_ok() conversion to return a speculation sanitized
> pointer or NULL is the way to go unless I'm missing something simpler.

No, that's way too big of a conversion.

Just make get_user() and friends (that currently use ASM_STAC) use the
address masking.

The people who use uaccess_begin() can use the lfence there.

Basically, the rule is trivial: find all 'stac' users, and use address
masking if those users already integrate the limit check, and lfence if
they don't.

                Linus
Linus Torvalds Jan. 17, 2018, 7:26 p.m. UTC | #14
On Wed, Jan 17, 2018 at 6:17 AM, Alan Cox <alan@linux.intel.com> wrote:
>
> Can we kill off the remaining users of set_fs() ?

I would love to, but it's not going to happen short-term. If ever.

Some could be removed today: the code in arch/x86/net/bpf_jit_comp.c
seems to be literally the ramblings of a diseased mind. There's no
reason for the set_fs(), there's no reason for the
flush_icache_range() (it's a no-op on x86 anyway), and the smp_wmb()
looks bogus too.

I have no idea how that braindamage happened, but I assume it got
copied from some broken source.

But there are about ~100 set_fs() calls in generic code, and some of
those really are pretty fundamental. Doing things like "kernel_read()"
without set_fs() is basically impossible.

We've had set_fs() since the beginning. The naming is obviously very
historical. We have it for a very good reason, and I don't really see
that reason going away.

So realistically, we want to _minimize_ set_fs(), and we might want to
make sure that it's done only in limited settings (it might, for
example, be a good idea and a realistic goal to make sure that drivers
and modules can't do it, and use proper helper functions like that
"read_kernel()").

But getting rid of the concept entirely? Doesn't seem likely.

             Linus
Dan Williams Jan. 17, 2018, 7:54 p.m. UTC | #15
On Wed, Jan 17, 2018 at 10:52 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Wed, Jan 17, 2018 at 02:17:26PM +0000, Alan Cox wrote:
[..]
>         Incidentally, what about copy_to_iter() and friends?  They
> check iov_iter flavour and go either into the "copy to kernel buffer"
> or "copy to userland" paths.  Do we need to deal with mispredictions
> there?  We are calling a bunch of those on read()...
>

Those should be protected by the conversion of __uaccess_begin to
__uaccess_begin_nospec that includes the lfence.
Eric Dumazet Jan. 17, 2018, 8:01 p.m. UTC | #16
On Wed, Jan 17, 2018 at 11:26 AM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> On Wed, Jan 17, 2018 at 6:17 AM, Alan Cox <alan@linux.intel.com> wrote:
>>
>> Can we kill off the remaining users of set_fs() ?
>
> I would love to, but it's not going to happen short-term. If ever.
>
> Some could be removed today: the code in arch/x86/net/bpf_jit_comp.c
> seems to be literally the ramblings of a diseased mind. There's no
> reason for the set_fs(), there's no reason for the
> flush_icache_range() (it's a no-op on x86 anyway), and the smp_wmb()
> looks bogus too.
>
> I have no idea how that braindamage happened, but I assume it got
> copied from some broken source.

At the time commit 0a14842f5a3c0e88a1e59fac5c3025db39721f74 went in,
this was the first JIT implementation for BPF, so maybe I wanted to make
sure other arches would not forget to flush the icache: you can bet that
my implementation served as a reference for other JITs.

At that time, various calls to flush_icache_range() were definitely in arch/x86
or kernel/module.c

(I believe I must have copied the code from kernel/module.c, but that
I am not sure)

>
> But there are about ~100 set_fs() calls in generic code, and some of
> those really are pretty fundamental. Doing things like "kernel_read()"
> without set_fs() is basically impossible.
>
> We've had set_fs() since the beginning. The naming is obviously very
> historical. We have it for a very good reason, and I don't really see
> that reason going away.
>
> So realistically, we want to _minimize_ set_fs(), and we might want to
> make sure that it's done only in limited settings (it might, for
> example, be a good idea and a realistic goal to make sure that drivers
> and modules can't do it, and use proper helper functions like that
> "read_kernel()").
>
> But getting rid of the concept entirely? Doesn't seem likely.
>
>              Linus
Al Viro Jan. 17, 2018, 8:05 p.m. UTC | #17
On Wed, Jan 17, 2018 at 11:54:12AM -0800, Dan Williams wrote:
> On Wed, Jan 17, 2018 at 10:52 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> > On Wed, Jan 17, 2018 at 02:17:26PM +0000, Alan Cox wrote:
> [..]
> >         Incidentally, what about copy_to_iter() and friends?  They
> > check iov_iter flavour and go either into the "copy to kernel buffer"
> > or "copy to userland" paths.  Do we need to deal with mispredictions
> > there?  We are calling a bunch of those on read()...
> >
> 
> Those should be protected by the conversion of __uaccess_begin to
> __uaccess_begin_nospec that includes the lfence.

Huh?  What the hell does it do to speculative execution of "memcpy those
suckers" branch?
Dan Williams Jan. 17, 2018, 8:14 p.m. UTC | #18
On Wed, Jan 17, 2018 at 12:05 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Wed, Jan 17, 2018 at 11:54:12AM -0800, Dan Williams wrote:
>> On Wed, Jan 17, 2018 at 10:52 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
>> > On Wed, Jan 17, 2018 at 02:17:26PM +0000, Alan Cox wrote:
>> [..]
>> >         Incidentally, what about copy_to_iter() and friends?  They
>> > check iov_iter flavour and go either into the "copy to kernel buffer"
>> > or "copy to userland" paths.  Do we need to deal with mispredictions
>> > there?  We are calling a bunch of those on read()...
>> >
>>
>> Those should be protected by the conversion of __uaccess_begin to
>> __uaccess_begin_nospec that includes the lfence.
>
> Huh?  What the hell does it do to speculative execution of "memcpy those
> suckers" branch?

'raw_copy_from_user' is changed to use '__uaccess_begin_nospec' instead
of plain '__uaccess_begin'. The only difference between the two is that
the former includes an lfence. So with this sequence:

        if (access_ok(VERIFY_READ, from, n)) {
                kasan_check_write(to, n);
                n = raw_copy_from_user(to, from, n);
        }
        return n;

...'from' is guaranteed to be within the address limit with respect to
speculative execution, or otherwise never de-referenced.
Christoph Hellwig Jan. 18, 2018, 4:38 p.m. UTC | #19
On Wed, Jan 17, 2018 at 11:26:08AM -0800, Linus Torvalds wrote:
> But there are about ~100 set_fs() calls in generic code, and some of
> those really are pretty fundamental. Doing things like "kernel_read()"
> without set_fs() is basically impossible.

Not if we move to iov_iter or iov_iter-like behavior for all reads
and writes.  There is an issue with how vectored writes are handled
in plain read/write vs read_iter/write_iter inherited from readv/writev,
but that's nothing a flag, or a second set of methods with the
same signature, couldn't handle.

But there are more annoying things, most notable in-kernel ioctls
calls.  We have quite a few of them, and while many are just utterly
stupid and can be replaced with direct function calls or new methods
(I've done quite a few conversions of those) some might be left.
Something like iov_iter might be the answer again.

Then we have things like probe_kernel_read/probe_kernel_write which
abuse the exception handling in get/put user.  But with a little
arch helper we don't strictly need get_fs/set_fs for that.
Linus Torvalds Jan. 18, 2018, 4:49 p.m. UTC | #20
On Thu, Jan 18, 2018 at 8:38 AM, Christoph Hellwig <hch@infradead.org> wrote:
>
> > But there are about ~100 set_fs() calls in generic code, and some of
> > those really are pretty fundamental. Doing things like "kernel_read()"
> > without set_fs() is basically impossible.
>
> Not if we move to iov_iter or iov_iter-like behavior for all reads
> and writes.

Not going to happen. Really. We have how many tens of thousands of
drivers again, all doing "copy_to_user()".

And the fact is, set_fs() really isn't even a problem for this. Never
really has been.   From a security standpoint, it would actually be
*much* worse if we made those ten thousand places do "if (kernel_flag)
memcpy() else copy_to_user()".

We've had some issues with set_fs() being abused in interesting ways.
But "kernel_read()" and friends is not it.

               Linus
Al Viro Jan. 18, 2018, 6:12 p.m. UTC | #21
On Thu, Jan 18, 2018 at 08:49:31AM -0800, Linus Torvalds wrote:
> On Thu, Jan 18, 2018 at 8:38 AM, Christoph Hellwig <hch@infradead.org> wrote:
> >
> > > But there are about ~100 set_fs() calls in generic code, and some of
> > > those really are pretty fundamental. Doing things like "kernel_read()"
> > > without set_fs() is basically impossible.
> >
> > Not if we move to iov_iter or iov_iter-like behavior for all reads
> > and writes.
> 
> Not going to happen. Really. We have how many tens of thousands of
> drivers again, all doing "copy_to_user()".

The real PITA is not even that (we could provide helpers making
conversion from ->read() to ->read_iter() easy for char devices,
etc.).  It's the semantics of readv(2).  Consider e.g. readv()
from /dev/rtc, with iovec array consisting of 10 segments, each
int-sized.  Right now we'll get rtc_dev_read() called in a loop,
once for each segment.  Single read() into 40-byte buffer will
fill one long and bugger off.  Converting it to ->read_iter()
will mean more than just "use copy_to_iter() instead of put_user()" -
that would be trivial.  But to preserve the current behaviour
we would need something like
	total = 0;
	while (iov_iter_count(to)) {
		count = iov_iter_single_seg_count(to);
		/* current body of rtc_dev_read(), with
		 * put_user() replaced with copy_to_iter()
		 */
		....
		if (res < 0) {
			if (!total)
				total = res;
			break;
		}
		total += res;
		if (res != count)
			break;
	}
	return total;
in that thing.  And similar boilerplates would be needed in
a whole lot of drivers.  Sure, they are individually trivial,
but they would add up to shitloads of code to get wrong.

These are basically all ->read() instances that ignore *ppos
and, unlike pipes, do not attempt to fill as much of the
buffer as possible.  We do have quite a few of such.

Some ->read() instances can be easily converted to ->read_iter()
and will, in fact, be better off that way.  We had patches of
that sort and I'm certain that we still have such places left.
Ditto for ->write() and ->write_iter().  But those are not
even close to being the majority.  Sorry.

We could, in principle, do something like

dev_rtc_read_iter(iocb, to)
{
	return loop_read_iter(iocb, to, modified_dev_rtc_read);
}
with modified_dev_rtc_read() being the result of minimal
conversion (put_user() and copy_to_user() replaced with used
of copy_to_iter()).  It would be less boilerplate that way,
but I really don't see big benefits from doing that.

On the write side the things are just as unpleasant - we have
a lot of ->write() instances that parse the beginning of the
buffer, ignore the rest and report that everything got written.
writev() on those will parse each iovec segment, ignoring the
junk in the end of each.  Again, that loop needs to go somewhere.
And we do have a bunch of "parse the buffer and do some action
once" ->write() instances - in char devices, debugfs, etc.

Patch

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index a31fd4fc6483..82c73f064e76 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -450,7 +450,7 @@  do {									\
 ({									\
 	int __gu_err;							\
 	__inttype(*(ptr)) __gu_val;					\
-	__uaccess_begin();						\
+	__uaccess_begin_nospec();					\
 	__get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT);	\
 	__uaccess_end();						\
 	(x) = (__force __typeof__(*(ptr)))__gu_val;			\
@@ -558,7 +558,7 @@  struct __large_struct { unsigned long buf[100]; };
  *	get_user_ex(...);
  * } get_user_catch(err)
  */
-#define get_user_try		uaccess_try
+#define get_user_try		uaccess_try_nospec
 #define get_user_catch(err)	uaccess_catch(err)
 
 #define get_user_ex(x, ptr)	do {					\
@@ -592,7 +592,7 @@  extern void __cmpxchg_wrong_size(void)
 	__typeof__(ptr) __uval = (uval);				\
 	__typeof__(*(ptr)) __old = (old);				\
 	__typeof__(*(ptr)) __new = (new);				\
-	__uaccess_begin();						\
+	__uaccess_begin_nospec();					\
 	switch (size) {							\
 	case 1:								\
 	{								\
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 72950401b223..ba2dc1930630 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -29,21 +29,21 @@  raw_copy_from_user(void *to, const void __user *from, unsigned long n)
 		switch (n) {
 		case 1:
 			ret = 0;
-			__uaccess_begin();
+			__uaccess_begin_nospec();
 			__get_user_asm_nozero(*(u8 *)to, from, ret,
 					      "b", "b", "=q", 1);
 			__uaccess_end();
 			return ret;
 		case 2:
 			ret = 0;
-			__uaccess_begin();
+			__uaccess_begin_nospec();
 			__get_user_asm_nozero(*(u16 *)to, from, ret,
 					      "w", "w", "=r", 2);
 			__uaccess_end();
 			return ret;
 		case 4:
 			ret = 0;
-			__uaccess_begin();
+			__uaccess_begin_nospec();
 			__get_user_asm_nozero(*(u32 *)to, from, ret,
 					      "l", "k", "=r", 4);
 			__uaccess_end();
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index f07ef3c575db..62546b3a398e 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -55,31 +55,31 @@  raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
 		return copy_user_generic(dst, (__force void *)src, size);
 	switch (size) {
 	case 1:
-		__uaccess_begin();
+		__uaccess_begin_nospec();
 		__get_user_asm_nozero(*(u8 *)dst, (u8 __user *)src,
 			      ret, "b", "b", "=q", 1);
 		__uaccess_end();
 		return ret;
 	case 2:
-		__uaccess_begin();
+		__uaccess_begin_nospec();
 		__get_user_asm_nozero(*(u16 *)dst, (u16 __user *)src,
 			      ret, "w", "w", "=r", 2);
 		__uaccess_end();
 		return ret;
 	case 4:
-		__uaccess_begin();
+		__uaccess_begin_nospec();
 		__get_user_asm_nozero(*(u32 *)dst, (u32 __user *)src,
 			      ret, "l", "k", "=r", 4);
 		__uaccess_end();
 		return ret;
 	case 8:
-		__uaccess_begin();
+		__uaccess_begin_nospec();
 		__get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
 			      ret, "q", "", "=r", 8);
 		__uaccess_end();
 		return ret;
 	case 10:
-		__uaccess_begin();
+		__uaccess_begin_nospec();
 		__get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
 			       ret, "q", "", "=r", 10);
 		if (likely(!ret))
@@ -89,7 +89,7 @@  raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
 		__uaccess_end();
 		return ret;
 	case 16:
-		__uaccess_begin();
+		__uaccess_begin_nospec();
 		__get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
 			       ret, "q", "", "=r", 16);
 		if (likely(!ret))
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 020f75cc8cf6..2429ca38dee6 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -31,6 +31,7 @@ 
  */
 ENTRY(copy_user_generic_unrolled)
 	ASM_STAC
+	ASM_IFENCE
 	cmpl $8,%edx
 	jb 20f		/* less then 8 bytes, go to byte copy loop */
 	ALIGN_DESTINATION
@@ -135,6 +136,7 @@  EXPORT_SYMBOL(copy_user_generic_unrolled)
  */
 ENTRY(copy_user_generic_string)
 	ASM_STAC
+	ASM_IFENCE
 	cmpl $8,%edx
 	jb 2f		/* less than 8 bytes, go to byte copy loop */
 	ALIGN_DESTINATION
@@ -175,6 +177,7 @@  EXPORT_SYMBOL(copy_user_generic_string)
  */
 ENTRY(copy_user_enhanced_fast_string)
 	ASM_STAC
+	ASM_IFENCE
 	cmpl $64,%edx
 	jb .L_copy_short_string	/* less then 64 bytes, avoid the costly 'rep' */
 	movl %edx,%ecx
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index c97d935a29e8..85f400b8ee7c 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -41,6 +41,7 @@  ENTRY(__get_user_1)
 	cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
 	jae bad_get_user
 	ASM_STAC
+	ASM_IFENCE
 1:	movzbl (%_ASM_AX),%edx
 	xor %eax,%eax
 	ASM_CLAC
@@ -55,6 +56,7 @@  ENTRY(__get_user_2)
 	cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
 	jae bad_get_user
 	ASM_STAC
+	ASM_IFENCE
 2:	movzwl -1(%_ASM_AX),%edx
 	xor %eax,%eax
 	ASM_CLAC
@@ -69,6 +71,7 @@  ENTRY(__get_user_4)
 	cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
 	jae bad_get_user
 	ASM_STAC
+	ASM_IFENCE
 3:	movl -3(%_ASM_AX),%edx
 	xor %eax,%eax
 	ASM_CLAC
@@ -84,6 +87,7 @@  ENTRY(__get_user_8)
 	cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
 	jae bad_get_user
 	ASM_STAC
+	ASM_IFENCE
 4:	movq -7(%_ASM_AX),%rdx
 	xor %eax,%eax
 	ASM_CLAC
@@ -95,6 +99,7 @@  ENTRY(__get_user_8)
 	cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
 	jae bad_get_user_8
 	ASM_STAC
+	ASM_IFENCE
 4:	movl -7(%_ASM_AX),%edx
 5:	movl -3(%_ASM_AX),%ecx
 	xor %eax,%eax
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 1b377f734e64..7add8ba06887 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -331,12 +331,12 @@  do {									\
 
 unsigned long __copy_user_ll(void *to, const void *from, unsigned long n)
 {
-	stac();
+	__uaccess_begin_nospec();
 	if (movsl_is_ok(to, from, n))
 		__copy_user(to, from, n);
 	else
 		n = __copy_user_intel(to, from, n);
-	clac();
+	__uaccess_end();
 	return n;
 }
 EXPORT_SYMBOL(__copy_user_ll);
@@ -344,7 +344,7 @@  EXPORT_SYMBOL(__copy_user_ll);
 unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
 					unsigned long n)
 {
-	stac();
+	__uaccess_begin_nospec();
 #ifdef CONFIG_X86_INTEL_USERCOPY
 	if (n > 64 && static_cpu_has(X86_FEATURE_XMM2))
 		n = __copy_user_intel_nocache(to, from, n);
@@ -353,7 +353,7 @@  unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr
 #else
 	__copy_user(to, from, n);
 #endif
-	clac();
+	__uaccess_end();
 	return n;
 }
 EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);