Message ID | 20230111123736.20025-9-kirill.shutemov@linux.intel.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | Linear Address Masking enabling | expand |
On Wed, Jan 11, 2023 at 03:37:27PM +0300, Kirill A. Shutemov wrote: > #define __untagged_addr(untag_mask, addr) ({ \ > u64 __addr = (__force u64)(addr); \ > - s64 sign = (s64)__addr >> 63; \ > - __addr &= untag_mask | sign; \ > + if (static_branch_likely(&tagged_addr_key)) { \ > + s64 sign = (s64)__addr >> 63; \ > + __addr &= untag_mask | sign; \ > + } \ > (__force __typeof__(addr))__addr; \ > }) > > #define untagged_addr(addr) __untagged_addr(current_untag_mask(), addr) Is the compiler clever enough to put the memop inside the branch?
On Tue, Jan 17, 2023 at 02:05:22PM +0100, Peter Zijlstra wrote: > On Wed, Jan 11, 2023 at 03:37:27PM +0300, Kirill A. Shutemov wrote: > > > #define __untagged_addr(untag_mask, addr) > > u64 __addr = (__force u64)(addr); \ > > - s64 sign = (s64)__addr >> 63; \ > > - __addr &= untag_mask | sign; \ > > + if (static_branch_likely(&tagged_addr_key)) { \ > > + s64 sign = (s64)__addr >> 63; \ > > + __addr &= untag_mask | sign; \ > > + } \ > > (__force __typeof__(addr))__addr; \ > > }) > > > > #define untagged_addr(addr) __untagged_addr(current_untag_mask(), addr) > > Is the compiler clever enough to put the memop inside the branch? Hm. You mean current_untag_mask() inside static_branch_likely()? But it is preprocessor who does this, not compiler. So, yes, the memop is inside the branch. Or I didn't understand your question.
On Tue, Jan 17, 2023 at 04:57:03PM +0300, Kirill A. Shutemov wrote: > On Tue, Jan 17, 2023 at 02:05:22PM +0100, Peter Zijlstra wrote: > > On Wed, Jan 11, 2023 at 03:37:27PM +0300, Kirill A. Shutemov wrote: > > > > > #define __untagged_addr(untag_mask, addr) > > > u64 __addr = (__force u64)(addr); \ > > > - s64 sign = (s64)__addr >> 63; \ > > > - __addr &= untag_mask | sign; \ > > > + if (static_branch_likely(&tagged_addr_key)) { \ > > > + s64 sign = (s64)__addr >> 63; \ > > > + __addr &= untag_mask | sign; \ > > > + } \ > > > (__force __typeof__(addr))__addr; \ > > > }) > > > > > > #define untagged_addr(addr) __untagged_addr(current_untag_mask(), addr) > > > > Is the compiler clever enough to put the memop inside the branch? > > Hm. You mean current_untag_mask() inside static_branch_likely()? > > But it is preprocessor who does this, not compiler. So, yes, the memop is > inside the branch. > > Or I didn't understand your question. Nah, call it a pre-lunch dip, I overlooked the whole CPP angle -- d'0h. That said, I did just put it through a compiler to see wth it did and it is pretty gross: GCC-12.2: 0000 00000000000023b0 <write_ok_or_segv>: 0000 23b0: 48 89 fa mov %rdi,%rdx 0003 23b3: eb 76 jmp 242b <write_ok_or_segv+0x7b> 0005 23b5: 65 48 8b 0d 00 00 00 00 mov %gs:0x0(%rip),%rcx # 23bd <write_ok_or_segv+0xd> 23b9: R_X86_64_PC32 tlbstate_untag_mask-0x4 000d 23bd: 48 89 f8 mov %rdi,%rax 0010 23c0: 48 c1 f8 3f sar $0x3f,%rax 0014 23c4: 48 09 c8 or %rcx,%rax 0017 23c7: 48 21 f8 and %rdi,%rax 001a 23ca: 48 b9 00 f0 ff ff ff 7f 00 00 movabs $0x7ffffffff000,%rcx 0024 23d4: 48 39 f1 cmp %rsi,%rcx 0027 23d7: 72 14 jb 23ed <write_ok_or_segv+0x3d> 0029 23d9: 48 29 f1 sub %rsi,%rcx 002c 23dc: be 01 00 00 00 mov $0x1,%esi 0031 23e1: 48 39 c1 cmp %rax,%rcx 0034 23e4: 72 07 jb 23ed <write_ok_or_segv+0x3d> 0036 23e6: 89 f0 mov %esi,%eax 0038 23e8: e9 00 00 00 00 jmp 23ed <write_ok_or_segv+0x3d> 23e9: R_X86_64_PLT32 __x86_return_thunk-0x4 003d 23ed: 65 48 8b 04 25 00 00 00 00 mov %gs:0x0,%rax 23f2: R_X86_64_32S pcpu_hot 0046 23f6: 48 89 90 68 0b 00 00 mov %rdx,0xb68(%rax) 004d 23fd: be 01 00 00 00 mov $0x1,%esi 0052 2402: bf 0b 00 00 00 mov $0xb,%edi 0057 2407: 48 c7 80 78 0b 00 00 06 00 00 00 movq $0x6,0xb78(%rax) 0062 2412: 48 c7 80 70 0b 00 00 0e 00 00 00 movq $0xe,0xb70(%rax) 006d 241d: e8 00 00 00 00 call 2422 <write_ok_or_segv+0x72> 241e: R_X86_64_PLT32 force_sig_fault-0x4 0072 2422: 31 f6 xor %esi,%esi 0074 2424: 89 f0 mov %esi,%eax 0076 2426: e9 00 00 00 00 jmp 242b <write_ok_or_segv+0x7b> 2427: R_X86_64_PLT32 __x86_return_thunk-0x4 007b 242b: 48 89 f8 mov %rdi,%rax 007e 242e: eb 9a jmp 23ca <write_ok_or_segv+0x1a> Note the stupid jump to the end and back. Not all sites do this mind you, but a fair number seem to do it. Let me try llvm to see if it is any smarter. CLANG-16: 0000 0000000000002d50 <write_ok_or_segv>: 0000 2d50: 41 57 push %r15 0002 2d52: 41 56 push %r14 0004 2d54: 41 54 push %r12 0006 2d56: 53 push %rbx 0007 2d57: 48 89 f3 mov %rsi,%rbx 000a 2d5a: 48 89 fa mov %rdi,%rdx 000d 2d5d: 49 89 fe mov %rdi,%r14 0010 2d60: eb 15 jmp 2d77 <write_ok_or_segv+0x27> 0012 2d62: 48 89 d0 mov %rdx,%rax 0015 2d65: 48 c1 f8 3f sar $0x3f,%rax 0019 2d69: 65 4c 8b 35 00 00 00 00 mov %gs:0x0(%rip),%r14 # 2d71 <write_ok_or_segv+0x21> 2d6d: R_X86_64_PC32 tlbstate_untag_mask-0x4 0021 2d71: 49 09 c6 or %rax,%r14 0024 2d74: 49 21 d6 and %rdx,%r14 0027 2d77: f3 0f 1e fa endbr64 002b 2d7b: 49 bf 00 f0 ff ff ff 7f 00 00 movabs $0x7ffffffff000,%r15 0035 2d85: 4d 89 fc mov %r15,%r12 0038 2d88: 49 29 dc sub %rbx,%r12 003b 2d8b: 72 05 jb 2d92 <write_ok_or_segv+0x42> 003d 2d8d: 4d 39 f4 cmp %r14,%r12 0040 2d90: 73 34 jae 2dc6 <write_ok_or_segv+0x76> 0042 2d92: 65 48 8b 05 00 00 00 00 mov %gs:0x0(%rip),%rax # 2d9a <write_ok_or_segv+0x4a> 2d96: R_X86_64_PC32 pcpu_hot-0x4 004a 2d9a: 48 c7 80 78 0b 00 00 06 00 00 00 movq $0x6,0xb78(%rax) 0055 2da5: 48 89 90 68 0b 00 00 mov %rdx,0xb68(%rax) 005c 2dac: 48 c7 80 70 0b 00 00 0e 00 00 00 movq $0xe,0xb70(%rax) 0067 2db7: bf 0b 00 00 00 mov $0xb,%edi 006c 2dbc: be 01 00 00 00 mov $0x1,%esi 0071 2dc1: e8 00 00 00 00 call 2dc6 <write_ok_or_segv+0x76> 2dc2: R_X86_64_PLT32 force_sig_fault-0x4 0076 2dc6: 4d 39 f4 cmp %r14,%r12 0079 2dc9: 0f 93 c1 setae %cl 007c 2dcc: 49 39 df cmp %rbx,%r15 007f 2dcf: 0f 93 c0 setae %al 0082 2dd2: 20 c8 and %cl,%al 0084 2dd4: 5b pop %rbx 0085 2dd5: 41 5c pop %r12 0087 2dd7: 41 5e pop %r14 0089 2dd9: 41 5f pop %r15 008b 2ddb: e9 00 00 00 00 jmp 2de0 <__pfx_get_gate_vma> 2ddc: R_X86_64_PLT32 __x86_return_thunk-0x4 Well, it got the untag right, but OMG.. :-( Joao, Sami, any idea why it put an ENDBR in there?
On Tue, Jan 17, 2023 at 7:02 AM Peter Zijlstra <peterz@infradead.org> wrote: > > On Tue, Jan 17, 2023 at 04:57:03PM +0300, Kirill A. Shutemov wrote: > > On Tue, Jan 17, 2023 at 02:05:22PM +0100, Peter Zijlstra wrote: > > > On Wed, Jan 11, 2023 at 03:37:27PM +0300, Kirill A. Shutemov wrote: > > > > > > > #define __untagged_addr(untag_mask, addr) > > > > u64 __addr = (__force u64)(addr); \ > > > > - s64 sign = (s64)__addr >> 63; \ > > > > - __addr &= untag_mask | sign; \ > > > > + if (static_branch_likely(&tagged_addr_key)) { \ > > > > + s64 sign = (s64)__addr >> 63; \ > > > > + __addr &= untag_mask | sign; \ > > > > + } \ > > > > (__force __typeof__(addr))__addr; \ > > > > }) > > > > > > > > #define untagged_addr(addr) __untagged_addr(current_untag_mask(), addr) > > > > > > Is the compiler clever enough to put the memop inside the branch? > > > > Hm. You mean current_untag_mask() inside static_branch_likely()? > > > > But it is preprocessor who does this, not compiler. So, yes, the memop is > > inside the branch. > > > > Or I didn't understand your question. > > Nah, call it a pre-lunch dip, I overlooked the whole CPP angle -- d'0h. > > That said, I did just put it through a compiler to see wth it did and it > is pretty gross: Yeah, I think the static branch likely just makes things worse. And if we really want to make the "no untag mask exists" case better, I think the code should probably use static_branch_unlikely() rather than *_likely(). That should make it jump to the masking code, and leave the unmasked code as a fallthrough, no? The reason clang seems to generate saner code is that clang seems to largely ignore the whole "__builtin_expect()", at least not to the point where it tries to make the unlikely case be out-of-line. But on the whole, I think we'd be better off without this whole static branch. The cost of "untagged_addr()" generally shouldn't be worth this. There are few performance-crticial users - the most common case is, I think, just mmap() and friends, and the single load is going to be a non-issue there. Looking around, I think the only situation where we may care is strnlen_user() and strncpy_from_user(). Those *can* be performance-critical. They're used for paths and for execve() strings, and can be a bit hot. And both of those cases actually just use it because of the whole "maximum address" calculation to avoid traversing into kernel addresses, so I wonder if we could use alternatives there, kind of like the get_user/put_user cases did. Except it's generic code, so .. But maybe even those aren't worth worrying about. At least they do the unmasking outside the loop - although then in the case of execve(), the string copies themselves are obviously done in a loop anyway. Kirill, do you have clear numbers for that static key being a noticeable win? Linus
On Tue, Jan 17, 2023 at 9:18 AM Linus Torvalds <torvalds@linux-foundation.org> wrote: > > The reason clang seems to generate saner code is that clang seems to > largely ignore the whole "__builtin_expect()", at least not to the > point where it tries to make the unlikely case be out-of-line. Side note: that's not something new or unusual. It's been the case since I started testing clang - we have several code-paths where we use "unlikely()" to try to get very unlikely cases to be out-of-line, and clang just mostly ignores it, or treats it as a very weak hint. I think the only way to get clang to treat it as a *strong* hint is to use PGO. And in this case it actually made code generation look better, probably because this particular use of static_branch_likely() is a bit confused about which side should be the preferred one. It's using the static branch to make the old case not have the masked load, but then it's saying that the new case is the likely one. So clang ignoring the likely() hint is probably the right thing here, and then the wrong thing in some other places. Linus
On Tue, Jan 17, 2023 at 09:18:01AM -0800, Linus Torvalds wrote: > The reason clang seems to generate saner code is that clang seems to > largely ignore the whole "__builtin_expect()", at least not to the > point where it tries to make the unlikely case be out-of-line. So in this case there is only a 'likely' hint, we're explicitly trying to keep the thing in-line so we can jump over it. It is GCC that generated an implicit else (and marked it 'unlikely' -- which we didn't ask for), but worse, it failed to spot the else case is in fact shared with the normal case and it could've simply lifted that mov instruction. That is, instead of this: 0003 23b3: eb 76 jmp 242b <write_ok_or_segv+0x7b> 0005 23b5: 65 48 8b 0d 00 00 00 00 mov %gs:0x0(%rip),%rcx # 23bd <write_ok_or_segv+0xd> 23b9: R_X86_64_PC32 tlbstate_untag_mask-0x4 000d 23bd: 48 89 f8 mov %rdi,%rax 0010 23c0: 48 c1 f8 3f sar $0x3f,%rax 0014 23c4: 48 09 c8 or %rcx,%rax 0017 23c7: 48 21 f8 and %rdi,%rax 001a 23ca: 48 b9 00 f0 ff ff ff 7f 00 00 movabs $0x7ffffffff000,%rcx 007b 242b: 48 89 f8 mov %rdi,%rax 007e 242e: eb 9a jmp 23ca <write_ok_or_segv+0x1a> It could've just done: 0003 48 89 f8 mov %rdi,%rax 0006 eb 76 jmp +18 0008 65 48 8b 0d 00 00 00 00 mov %gs:0x0(%rip),%rcx # 23bd <write_ok_or_segv+0xd> 23b9: R_X86_64_PC32 tlbstate_untag_mask-0x4 0010 48 c1 f8 3f sar $0x3f,%rax 0014 48 09 c8 or %rcx,%rax 0017 48 21 f8 and %rdi,%rax 001a 48 b9 00 f0 ff ff ff 7f 00 00 movabs $0x7ffffffff000,%rcx and everything would've been good. In all the cases I've seen it do this, it was the same, it has this silly move out of line that's also part of the regular branch. That is, I like __builtin_expect() to be a strong hint. If I don't want things out of line, I shouldn't have put unlikely on it. What I don't like is that implicit else branches get the opposite strong hint. What I like even less is that it found it needed that else branch at all.
On Tue, Jan 17, 2023 at 09:18:01AM -0800, Linus Torvalds wrote: > But maybe even those aren't worth worrying about. At least they do the > unmasking outside the loop - although then in the case of execve(), > the string copies themselves are obviously done in a loop anyway. > > Kirill, do you have clear numbers for that static key being a noticeable win? Numbers would be good; I think this all started as a precaution, but clearly that's not really working out so well :/
On Tue, Jan 17, 2023 at 9:29 AM Linus Torvalds <torvalds@linux-foundation.org> wrote: > > On Tue, Jan 17, 2023 at 9:18 AM Linus Torvalds > <torvalds@linux-foundation.org> wrote: > > > > The reason clang seems to generate saner code is that clang seems to > > largely ignore the whole "__builtin_expect()", at least not to the > > point where it tries to make the unlikely case be out-of-line. > > Side note: that's not something new or unusual. It's been the case > since I started testing clang - we have several code-paths where we > use "unlikely()" to try to get very unlikely cases to be out-of-line, > and clang just mostly ignores it, or treats it as a very weak hint. I > think the only way to get clang to treat it as a *strong* hint is to > use PGO. I'd be surprised if that were intentional or by design. Do you guys have a bug report we could look at? > So clang ignoring the likely() hint is probably the right thing here, > and then the wrong thing in some other places.
On Tue, Jan 17, 2023 at 10:26 AM Nick Desaulniers <ndesaulniers@google.com> wrote: > > On Tue, Jan 17, 2023 at 9:29 AM Linus Torvalds > <torvalds@linux-foundation.org> wrote: > > > > Side note: that's not something new or unusual. It's been the case > > since I started testing clang - we have several code-paths where we > > use "unlikely()" to try to get very unlikely cases to be out-of-line, > > and clang just mostly ignores it, or treats it as a very weak hint. I > > think the only way to get clang to treat it as a *strong* hint is to > > use PGO. > > I'd be surprised if that were intentional or by design. > > Do you guys have a bug report we could look at? Heh. I actually sent you an example long ago. Let me go fish it out of my mail archives and quote some of it below so that you can find it in yours.. Linus [ Time passes. Found this email to you and Bill Wendling from Feb 16, 2020, Message-ID CAHk-=wigVshsByCMjkUiZyQSR5N5zi2aAeQc+VJCzQV=nm8E7g@mail.gmail.com ]: Anyway, I'm looking at clang code generation, and comparing it with gcc on one of my "this has been optimized to hell and back" functions: __d_lookup_rcu(). It looks like clang does frame pointers, and ignores our likely/unlikely annotations. So this code: if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) { int tlen; const char *tname; ...... doesn't actually jump out of line, but instead generates the unlikely case as the fallthrough: testb $2, (%r12) je .LBB50_9 ... unlikely code goes here... and then the likely case ends up having unfortunate reloads inside the hot loop. Possibly because it has one fewer free registers than gcc because of the frame pointer. I didn't look into _why_ clang generates frame pointers but gcc doesn't. It may be just a compiler default, I think we don't end up explicitly asking either way. [ And then Bill replied with this ] It's not a no-op. We add branch probabilities to the IR, whether they're honored or not depends on the transformation. But they *should* be honored when available. I've seen in the past that instead of moving unlikely blocks out of the way (like gcc, which moves them below the function's "ret" instruction), LLVM will do something like this: <normal code> <jmp to loop conditional test> <unlikely code> <more likely code> <loop conditional test> <...> I.e. the loop is rotated and the unlikely code is first and the hotter code is closer together but between the unlikely and conditional test. Could this be what's going on? Otherwise, maybe clang decided that it's not beneficial to move the code out-of-line because the benefit was minimal? (I'm guessing here.)
On Tue, Jan 17, 2023 at 10:34 AM Linus Torvalds <torvalds@linux-foundation.org> wrote: > > On Tue, Jan 17, 2023 at 10:26 AM Nick Desaulniers > <ndesaulniers@google.com> wrote: > > > > On Tue, Jan 17, 2023 at 9:29 AM Linus Torvalds > > <torvalds@linux-foundation.org> wrote: > > > > > > Side note: that's not something new or unusual. It's been the case > > > since I started testing clang - we have several code-paths where we > > > use "unlikely()" to try to get very unlikely cases to be out-of-line, > > > and clang just mostly ignores it, or treats it as a very weak hint. I > > > think the only way to get clang to treat it as a *strong* hint is to > > > use PGO. > > > > I'd be surprised if that were intentional or by design. > > > > Do you guys have a bug report we could look at? > > Heh. I actually sent you an example long ago. Let me go fish it out of > my mail archives and quote some of it below so that you can find it in > yours.. > > Linus > > [ Time passes. Found this email to you and Bill Wendling from Feb 16, > 2020, Message-ID > CAHk-=wigVshsByCMjkUiZyQSR5N5zi2aAeQc+VJCzQV=nm8E7g@mail.gmail.com ]: > > Anyway, I'm looking at clang code generation, and comparing it with > gcc on one of my "this has been optimized to hell and back" functions: > __d_lookup_rcu(). > > It looks like clang does frame pointers, and ignores our > likely/unlikely annotations. > > So this code: > > if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) { > int tlen; > const char *tname; > ...... > > doesn't actually jump out of line, but instead generates the unlikely > case as the fallthrough: > > testb $2, (%r12) > je .LBB50_9 > ... unlikely code goes here... Perhaps that was compiler version or config specific? $ make LLVM=1 -j128 defconfig fs/dcache.o $ llvm-objdump -d --no-show-raw-insn --disassemble-symbols=__d_lookup_rcu fs/dcache.o 0000000000003210 <__d_lookup_rcu>: 3210: endbr64 3214: pushq %rbp 3215: pushq %r15 3217: pushq %r14 3219: pushq %r12 321b: pushq %rbx 321c: testb $0x2, (%rdi) 321f: jne 0x32d7 <__d_lookup_rcu+0xc7> ... 32d7: popq %rbx 32d8: popq %r12 32da: popq %r14 32dc: popq %r15 32de: popq %rbp 32df: jmp 0x3300 <__d_lookup_rcu_op_compare> That looks like what you want, yeah? Your original report was from nearly 3 years ago; could have fixed a few instances of branch weights not getting propagated since then. What's going on in this case in this thread? I know we don't support hot/cold attributes on labels yet, but if static_branch_likely (or friends) is being used, we assign the indirect branches a 0% likeliness/branch-weight. > > and then the likely case ends up having unfortunate reloads inside the > hot loop. Possibly because it has one fewer free registers than gcc > because of the frame pointer. > > I didn't look into _why_ clang generates frame pointers but gcc > doesn't. It may be just a compiler default, I think we don't end up > explicitly asking either way. > > [ And then Bill replied with this ] > > It's not a no-op. We add branch probabilities to the IR, whether > they're honored or not depends on the transformation. But they > *should* be honored when available. I've seen in the past that instead > of moving unlikely blocks out of the way (like gcc, which moves them > below the function's "ret" instruction), LLVM will do something like > this: > > <normal code> > <jmp to loop conditional test> > <unlikely code> > <more likely code> > <loop conditional test> > <...> > > I.e. the loop is rotated and the unlikely code is first and the hotter > code is closer together but between the unlikely and conditional test. > Could this be what's going on? Otherwise, maybe clang decided that > it's not beneficial to move the code out-of-line because the benefit > was minimal? (I'm guessing here.)
On Tue, Jan 17, 2023 at 11:17 AM Nick Desaulniers <ndesaulniers@google.com> wrote: > > Perhaps that was compiler version or config specific? Possible, but... The clang code generation annoyed me enough that I actually ended up rewriting the unlikely test to be outside the loop in commit ae2a823643d7 ("dcache: move the DCACHE_OP_COMPARE case out of the __d_lookup_rcu loop"). I think that then made clang no longer have the whole "rotate loop with unlikely case in the middle" issue. And then because clang *still* messed up by trying to be too clever (see https://lore.kernel.org/all/CAHk-=wjyOB66pofW0mfzDN7SO8zS1EMRZuR-_2aHeO+7kuSrAg@mail.gmail.com/ for details), I also ended up doing commit c4e34dd99f2e ("x86: simplify load_unaligned_zeropad() implementation"). The end result is that now the compiler almost *cannot* mess up any more. So the reason clang now does a good job on __d_lookup_rcu() is largely that I took away all the places where it did badly ;) That said, clang still generates more register pressure than gcc, causing the function prologue and epilogue to be rather bigger (pushing and popping six registers, as opposed to gcc that only needs three) Gcc is also better able to schedule the prologue and epilogue together with the work of the function, which clang seems to always do it as a "push all" and "pop all" sequence. That scheduling doesn't matter in that particular place (although it does make the unlikely case of calling __d_lookup_rcu_op_compare pointlessly push all regs only to then pop them), but I've seen a few other cases where it ends up meaning that it always does that full function prologue even when the *likely* case then returns early and doesn't actually need any of that work because it didn't use any of those registers. But yeah, the RCU pathname lookup looks fine these days. And I don't actually think it was due to clang changes ;) Linus
On Tue, Jan 17, 2023 at 12:10 PM Linus Torvalds <torvalds@linux-foundation.org> wrote: > > That said, clang still generates more register pressure than gcc, > causing the function prologue and epilogue to be rather bigger > (pushing and popping six registers, as opposed to gcc that only needs > three) .. and at least part of that is the same thing with the bad byte mask generation (see that "clang *still* messes up" link for details). Basically, the byte mask is computed by mask = bytemask_from_count(tcount); where we have #define bytemask_from_count(cnt) (~(~0ul << (cnt)*8)) and clang tries very very hard to avoid that "multiply by 8", so instead it keeps a shadow copy of that "(cnt)*8" value in the loop. That is wrong for a couple of reasons: (a) it adds register pressure for no good reason (b) when you shift left by that value, only the low 6 bits of that value matters And guess how that "tcount" is updated? It's this: tcount -= sizeof(unsigned long); in the loop, and thus the update of that shadow value of "(cnt)*8" is done as addl $-64, %ecx inside that loop. This is truly stupid and wasted work, because the low 6 bits of the value - remember, the only part that matters - DOES NOT CHANGE when you do that. So clang has decided that it needs to (a) avoid the "expensive" multiply-by-8 at the end by turning it into a repeated "add $-64" inside the loop (b) added register pressure and one extra instruction inside the loop (c) not realized that that extra instruction doesn't actually *do* anything, because it only affects the bits that don't actually matter in the end. which is all kind of silly, wouldn't you agree. Every single step there was pointless. But with my other simplifications, the fact that clang does these extra things is no longer all that noticeable. It *used* to be a horrible disaster because the extra register pressure ended up meaning that you had spills and all kinds of nastiness. Now the function is simple enough that even with the extra register pressure, there's no need for spills. .. until you look at the 32-bit version, which still needs spills. Gcc does too, but clang just makes it worse by having the extra pointless shadow variable. If I cared about 32-bit, I might write up a bugzilla entry. As it is, it's just "clang tries to be clever, and in the process is actually being stupid". Linus
On Tue, Jan 17, 2023 at 04:02:06PM +0100, Peter Zijlstra wrote: > On Tue, Jan 17, 2023 at 04:57:03PM +0300, Kirill A. Shutemov wrote: > > On Tue, Jan 17, 2023 at 02:05:22PM +0100, Peter Zijlstra wrote: > > > On Wed, Jan 11, 2023 at 03:37:27PM +0300, Kirill A. Shutemov wrote: > > > > > > > #define __untagged_addr(untag_mask, addr) > > > > u64 __addr = (__force u64)(addr); \ > > > > - s64 sign = (s64)__addr >> 63; \ > > > > - __addr &= untag_mask | sign; \ > > > > + if (static_branch_likely(&tagged_addr_key)) { \ > > > > + s64 sign = (s64)__addr >> 63; \ > > > > + __addr &= untag_mask | sign; \ > > > > + } \ > > > > (__force __typeof__(addr))__addr; \ > > > > }) > > > > > > > > #define untagged_addr(addr) __untagged_addr(current_untag_mask(), addr) > > > > > > Is the compiler clever enough to put the memop inside the branch? > > > > Hm. You mean current_untag_mask() inside static_branch_likely()? > > > > But it is preprocessor who does this, not compiler. So, yes, the memop is > > inside the branch. > > > > Or I didn't understand your question. > > Nah, call it a pre-lunch dip, I overlooked the whole CPP angle -- d'0h. > > That said, I did just put it through a compiler to see wth it did and it > is pretty gross: I tried to replace static branch with alternative. It kinda works, but required few hack. Thanks to Andrew Cooper for helping to untangle them. I am not sure if it worth the effort. I don't have any evidence that it helps. untagged_addr() overhead is rather small and hides in noise of syscall cost. I only made alternative for untagged_addr(), but not for untagged_addr_remote(). _remote() case has very few users. BTW, it would be nice to be able to apply alternative later, delaying it until the first user of LAM, like I did with static_branch. We don't have a way to do this right? Any opinions? I am okay dropping the patch altogether. diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index c44b56f7ffba..3f0c31044f02 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -75,6 +75,12 @@ # define DISABLE_CALL_DEPTH_TRACKING (1 << (X86_FEATURE_CALL_DEPTH & 31)) #endif +#ifdef CONFIG_ADDRESS_MASKING +# define DISABLE_LAM 0 +#else +# define DISABLE_LAM (1 << (X86_FEATURE_LAM & 31)) +#endif + #ifdef CONFIG_INTEL_IOMMU_SVM # define DISABLE_ENQCMD 0 #else @@ -115,7 +121,7 @@ #define DISABLED_MASK10 0 #define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \ DISABLE_CALL_DEPTH_TRACKING) -#define DISABLED_MASK12 0 +#define DISABLED_MASK12 (DISABLE_LAM) #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 #define DISABLED_MASK15 0 diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index f9f85d596581..57ccb91fcccf 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -9,6 +9,7 @@ #include <linux/kasan-checks.h> #include <linux/mm_types.h> #include <linux/string.h> +#include <linux/mmap_lock.h> #include <asm/asm.h> #include <asm/page.h> #include <asm/smap.h> @@ -24,28 +25,48 @@ static inline bool pagefault_disabled(void); #endif #ifdef CONFIG_ADDRESS_MASKING -DECLARE_STATIC_KEY_FALSE(tagged_addr_key); +static inline unsigned long __untagged_addr(unsigned long addr) +{ + /* + * Magic with the 'sign' allows to untag userspace pointer without + * any branches while leaving kernel addresses intact. + */ + long sign; + + /* + * Refer tlbstate_untag_mask directly to avoid RIP-relative relocation + * in alternative instructions. The relocation gets wrong when gets + * copied to the target place. + */ + asm (ALTERNATIVE("", + "sar $63, %[sign]\n\t" /* user_ptr ? 0 : -1UL */ + "or %%gs:tlbstate_untag_mask, %[sign]\n\t" + "and %[sign], %[addr]\n\t", X86_FEATURE_LAM) + : [addr] "+r" (addr), [sign] "=r" (sign) + : "m" (tlbstate_untag_mask), "[sign]" (addr)); + + return addr; +} -/* - * Mask out tag bits from the address. - * - * Magic with the 'sign' allows to untag userspace pointer without any branches - * while leaving kernel addresses intact. - */ -#define __untagged_addr(untag_mask, addr) ({ \ - u64 __addr = (__force u64)(addr); \ - if (static_branch_likely(&tagged_addr_key)) { \ - s64 sign = (s64)__addr >> 63; \ - __addr &= untag_mask | sign; \ - } \ - (__force __typeof__(addr))__addr; \ +#define untagged_addr(addr) ({ \ + unsigned long __addr = (__force unsigned long)(addr); \ + (__force __typeof__(addr))__untagged_addr(__addr); \ }) -#define untagged_addr(addr) __untagged_addr(current_untag_mask(), addr) +static inline unsigned long __untagged_addr_remote(struct mm_struct *mm, + unsigned long addr) +{ + long sign = addr >> 63; + + mmap_assert_locked(mm); + addr &= (mm)->context.untag_mask | sign; + + return addr; +} #define untagged_addr_remote(mm, addr) ({ \ - mmap_assert_locked(mm); \ - __untagged_addr((mm)->context.untag_mask, addr); \ + unsigned long __addr = (__force unsigned long)(addr); \ + (__force __typeof__(addr))__untagged_addr_remote(mm, __addr); \ }) #else diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 0831d2be190f..e006725afdf1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -745,9 +745,6 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) #ifdef CONFIG_ADDRESS_MASKING -DEFINE_STATIC_KEY_FALSE(tagged_addr_key); -EXPORT_SYMBOL_GPL(tagged_addr_key); - #define LAM_U57_BITS 6 static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits) @@ -787,8 +784,6 @@ static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits) set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags); mmap_write_unlock(mm); - - static_branch_enable(&tagged_addr_key); return 0; } #endif
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 32c9dd052e43..f9f85d596581 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -24,6 +24,8 @@ static inline bool pagefault_disabled(void); #endif #ifdef CONFIG_ADDRESS_MASKING +DECLARE_STATIC_KEY_FALSE(tagged_addr_key); + /* * Mask out tag bits from the address. * @@ -32,8 +34,10 @@ static inline bool pagefault_disabled(void); */ #define __untagged_addr(untag_mask, addr) ({ \ u64 __addr = (__force u64)(addr); \ - s64 sign = (s64)__addr >> 63; \ - __addr &= untag_mask | sign; \ + if (static_branch_likely(&tagged_addr_key)) { \ + s64 sign = (s64)__addr >> 63; \ + __addr &= untag_mask | sign; \ + } \ (__force __typeof__(addr))__addr; \ }) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 88aae519c8f8..1b41c60ebf6e 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -745,6 +745,9 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) #ifdef CONFIG_ADDRESS_MASKING +DEFINE_STATIC_KEY_FALSE(tagged_addr_key); +EXPORT_SYMBOL_GPL(tagged_addr_key); + #define LAM_U57_BITS 6 static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits) @@ -781,6 +784,7 @@ static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits) mmap_write_unlock(mm); + static_branch_enable(&tagged_addr_key); return 0; } #endif
Use static key to reduce untagged_addr() overhead. The key only gets enabled when the first process enables LAM. Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> --- arch/x86/include/asm/uaccess.h | 8 ++++++-- arch/x86/kernel/process_64.c | 4 ++++ 2 files changed, 10 insertions(+), 2 deletions(-)