| Message ID | 20180313205945.245105-7-thgarnie@google.com (mailing list archive) |
|---|---|
| State | New, archived |
On Tue, Mar 13, 2018 at 01:59:24PM -0700, Thomas Garnier wrote:

> @@ -1576,7 +1578,9 @@ first_nmi:
> 	addq	$8, (%rsp)	/* Fix up RSP */
> 	pushfq			/* RFLAGS */
> 	pushq	$__KERNEL_CS	/* CS */
> -	pushq	$1f		/* RIP */
> +	pushq	%rax		/* Support Position Independent Code */
> +	leaq	1f(%rip), %rax	/* RIP */
> +	xchgq	%rax, (%rsp)	/* Restore RAX, put 1f */
> 	iretq			/* continues at repeat_nmi below */
> 	UNWIND_HINT_IRET_REGS
> 1:

Urgh, xchg with a memop has an implicit LOCK prefix.
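(For context on Peter's objection: on x86, an xchg whose operand is in memory is
architecturally locked whether or not a LOCK prefix is written, so the xchgq in the
hunk above turns a simple register/stack swap into a full atomic read-modify-write.
A minimal illustration, not part of the thread, with a made-up label and assuming
GNU as syntax:)

	.text
xchg_lock_demo:
	xchgq	%rax, (%rsp)		/* memory operand: implicitly LOCKed */
	lock xchgq %rax, (%rsp)		/* explicit prefix changes nothing; same locked semantics */
	xchgq	%rax, %rcx		/* register-register form: never locked */
	ret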
On Wed, 14 Mar 2018, Peter Zijlstra wrote:

> On Tue, Mar 13, 2018 at 01:59:24PM -0700, Thomas Garnier wrote:
> > @@ -1576,7 +1578,9 @@ first_nmi:
> > 	addq	$8, (%rsp)	/* Fix up RSP */
> > 	pushfq			/* RFLAGS */
> > 	pushq	$__KERNEL_CS	/* CS */
> > -	pushq	$1f		/* RIP */
> > +	pushq	%rax		/* Support Position Independent Code */
> > +	leaq	1f(%rip), %rax	/* RIP */
> > +	xchgq	%rax, (%rsp)	/* Restore RAX, put 1f */
> > 	iretq			/* continues at repeat_nmi below */
> > 	UNWIND_HINT_IRET_REGS
> > 1:
>
> Urgh, xchg with a memop has an implicit LOCK prefix.

this_cpu_xchg uses no lock cmpxchg as a replacement to reduce latency.

From linux/arch/x86/include/asm/percpu.h:

/*
 * xchg is implemented using cmpxchg without a lock prefix. xchg is
 * expensive due to the implied lock prefix. The processor cannot prefetch
 * cachelines if xchg is used.
 */
#define percpu_xchg_op(var, nval)					\
({									\
	typeof(var) pxo_ret__;						\
	typeof(var) pxo_new__ = (nval);					\
	switch (sizeof(var)) {						\
	case 1:								\
		asm("\n\tmov "__percpu_arg(1)",%%al"			\
		    "\n1:\tcmpxchgb %2, "__percpu_arg(1)		\
		    "\n\tjnz 1b"					\
			    : "=&a" (pxo_ret__), "+m" (var)		\
			    : "q" (pxo_new__)				\
			    : "memory");				\
		break;							\
	case 2:								\
		asm("\n\tmov "__percpu_arg(1)",%%ax"			\
		    "\n1:\tcmpxchgw %2, "__percpu_arg(1)		\
		    "\n\tjnz 1b"					\
			    : "=&a" (pxo_ret__), "+m" (var)		\
			    : "r" (pxo_new__)				\
			    : "memory");				\
		break;							\
	case 4:								\
		asm("\n\tmov "__percpu_arg(1)",%%eax"			\
		    "\n1:\tcmpxchgl %2, "__percpu_arg(1)		\
		    "\n\tjnz 1b"					\
			    : "=&a" (pxo_ret__), "+m" (var)		\
			    : "r" (pxo_new__)				\
			    : "memory");				\
		break;							\
	case 8:								\
		asm("\n\tmov "__percpu_arg(1)",%%rax"			\
		    "\n1:\tcmpxchgq %2, "__percpu_arg(1)		\
		    "\n\tjnz 1b"					\
			    : "=&a" (pxo_ret__), "+m" (var)		\
			    : "r" (pxo_new__)				\
			    : "memory");				\
		break;							\
	default: __bad_percpu_size();					\
	}								\
	pxo_ret__;							\
})
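(The macro above implements the exchange as a plain load followed by a cmpxchg retry
loop, with no LOCK prefix; that is only safe because per-CPU variables are never
contended from other CPUs. A standalone sketch of the same pattern in bare assembly,
using a hypothetical helper name that does not exist in the kernel:)

	.text
/*
 * unlocked_xchg64: exchange the 64-bit value at (%rdi) with %rsi and
 * return the old value in %rax, without a LOCK prefix.  Only valid when
 * no other CPU can touch (%rdi) concurrently, e.g. per-CPU data.
 */
unlocked_xchg64:
	movq	(%rdi), %rax		/* load the expected old value */
1:	cmpxchgq %rsi, (%rdi)		/* if (%rdi) == %rax, store %rsi; else reload %rax */
	jnz	1b			/* value changed under us, retry */
	ret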
On Wed, Mar 14, 2018 at 8:55 AM Christopher Lameter <cl@linux.com> wrote:

> On Wed, 14 Mar 2018, Peter Zijlstra wrote:
> > On Tue, Mar 13, 2018 at 01:59:24PM -0700, Thomas Garnier wrote:
> > > @@ -1576,7 +1578,9 @@ first_nmi:
> > > 	addq	$8, (%rsp)	/* Fix up RSP */
> > > 	pushfq			/* RFLAGS */
> > > 	pushq	$__KERNEL_CS	/* CS */
> > > -	pushq	$1f		/* RIP */
> > > +	pushq	%rax		/* Support Position Independent Code */
> > > +	leaq	1f(%rip), %rax	/* RIP */
> > > +	xchgq	%rax, (%rsp)	/* Restore RAX, put 1f */
> > > 	iretq			/* continues at repeat_nmi below */
> > > 	UNWIND_HINT_IRET_REGS
> > > 1:
> >
> > Urgh, xchg with a memop has an implicit LOCK prefix.
>
> this_cpu_xchg uses no lock cmpxchg as a replacement to reduce latency.

Great, I will update my implementation. Thanks Peter and Christoph.

> From linux/arch/x86/include/asm/percpu.h
> [ quoted percpu_xchg_op() definition trimmed; see Christopher's mail above ]
On 14/03/2018 16:54, Christopher Lameter wrote:
>>> +	pushq	%rax		/* Support Position Independent Code */
>>> +	leaq	1f(%rip), %rax	/* RIP */
>>> +	xchgq	%rax, (%rsp)	/* Restore RAX, put 1f */
>>> 	iretq			/* continues at repeat_nmi below */
>>> 	UNWIND_HINT_IRET_REGS
>>> 1:
>>
>> Urgh, xchg with a memop has an implicit LOCK prefix.
>
> this_cpu_xchg uses no lock cmpxchg as a replacement to reduce latency.

That requires using a second register, since %rax is used as the
comparison source. At this point it's easier to just push %rax twice:

	pushq %rax
	pushq %rax
	leaq 1f(%rip), %rax
	movq %rax, 8(%rsp)
	popq %rax
	iretq

Paolo
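(Putting Paolo's suggestion back into the first_nmi frame construction that the
original hunk touches, as a sketch only; this is not necessarily the code that was
eventually merged:)

	addq	$8, (%rsp)		/* Fix up RSP */
	pushfq				/* RFLAGS */
	pushq	$__KERNEL_CS		/* CS */
	pushq	%rax			/* placeholder that will become the RIP slot */
	pushq	%rax			/* scratch slot so %rax can be restored */
	leaq	1f(%rip), %rax
	movq	%rax, 8(%rsp)		/* plain store of &1f into the RIP slot, no lock */
	popq	%rax			/* restore %rax; the RIP slot is now on top */
	iretq				/* continues at repeat_nmi below */
	UNWIND_HINT_IRET_REGS
1: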
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index bd53c57617e6..c53123468364 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -191,7 +191,7 @@ ENTRY(entry_SYSCALL_64_trampoline)
 	 * spill RDI and restore it in a second-stage trampoline.
 	 */
 	pushq	%rdi
-	movq	$entry_SYSCALL_64_stage2, %rdi
+	movabsq	$entry_SYSCALL_64_stage2, %rdi
 	JMP_NOSPEC %rdi
 END(entry_SYSCALL_64_trampoline)
 
@@ -1275,7 +1275,8 @@ ENTRY(error_entry)
 	movl	%ecx, %eax			/* zero extend */
 	cmpq	%rax, RIP+8(%rsp)
 	je	.Lbstep_iret
-	cmpq	$.Lgs_change, RIP+8(%rsp)
+	leaq	.Lgs_change(%rip), %rcx
+	cmpq	%rcx, RIP+8(%rsp)
 	jne	.Lerror_entry_done
 
 	/*
@@ -1480,10 +1481,10 @@ ENTRY(nmi)
 	 * resume the outer NMI.
 	 */
 
-	movq	$repeat_nmi, %rdx
+	leaq	repeat_nmi(%rip), %rdx
 	cmpq	8(%rsp), %rdx
 	ja	1f
-	movq	$end_repeat_nmi, %rdx
+	leaq	end_repeat_nmi(%rip), %rdx
 	cmpq	8(%rsp), %rdx
 	ja	nested_nmi_out
 1:
@@ -1537,7 +1538,8 @@ nested_nmi:
 	pushq	%rdx
 	pushfq
 	pushq	$__KERNEL_CS
-	pushq	$repeat_nmi
+	leaq	repeat_nmi(%rip), %rdx
+	pushq	%rdx
 
 	/* Put stack back */
 	addq	$(6*8), %rsp
@@ -1576,7 +1578,9 @@ first_nmi:
 	addq	$8, (%rsp)	/* Fix up RSP */
 	pushfq			/* RFLAGS */
 	pushq	$__KERNEL_CS	/* CS */
-	pushq	$1f		/* RIP */
+	pushq	%rax		/* Support Position Independent Code */
+	leaq	1f(%rip), %rax	/* RIP */
+	xchgq	%rax, (%rsp)	/* Restore RAX, put 1f */
 	iretq			/* continues at repeat_nmi below */
 	UNWIND_HINT_IRET_REGS
 1:
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index a7227dfe1a2b..0c0fc259a4e2 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -208,11 +208,9 @@ identity_mapped:
 	movq	%rax, %cr3
 	lea	PAGE_SIZE(%r8), %rsp
 	call	swap_pages
-	jmp	*virtual_mapped_addr(%rip)
-
-	/* Absolute value for PIE support */
-virtual_mapped_addr:
-	.quad virtual_mapped
+	movabsq	$virtual_mapped, %rax
+	pushq	%rax
+	ret
 
 virtual_mapped:
 	movq	RSP(%r8), %rsp
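(The pattern throughout the diff above: a 32-bit sign-extended immediate such as
"movq $sym, %reg" or "pushq $sym" only works when the symbol's address fits in a
sign-extended 32 bits, i.e. the kernel's -2GB mapping, so PIE-compatible code
replaces it either with a RIP-relative leaq when a position-independent reference
is enough, or with movabsq when the absolute link-time address is genuinely needed,
as in the trampoline and the kexec identity-mapped path. A small editorial
comparison of the three forms, with a made-up symbol name:)

	.text
addressing_demo:
	movq	$some_symbol, %rdx	/* 32-bit sign-extended absolute (R_X86_64_32S);
					 * needs the symbol in the +/-2GB range, breaks PIE */
	leaq	some_symbol(%rip), %rdx	/* RIP-relative (R_X86_64_PC32); position independent */
	movabsq	$some_symbol, %rdx	/* full 64-bit absolute immediate (R_X86_64_64);
					 * position dependent but can encode any address */
	ret

	.data
some_symbol:
	.quad	0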
Change the assembly code to use only relative references to symbols so
that the kernel can be PIE compatible.

Position Independent Executable (PIE) support will allow extending the
KASLR randomization range below the -2G memory limit.

Signed-off-by: Thomas Garnier <thgarnie@google.com>
---
 arch/x86/entry/entry_64.S            | 16 ++++++++++------
 arch/x86/kernel/relocate_kernel_64.S |  8 +++-----
 2 files changed, 13 insertions(+), 11 deletions(-)