[v2,06/27] x86/entry/64: Adapt assembly for PIE support

Message ID 20180313205945.245105-7-thgarnie@google.com (mailing list archive)
State New, archived

Commit Message

Thomas Garnier March 13, 2018, 8:59 p.m. UTC
Change the assembly code to use only relative references to symbols so that
the kernel can be PIE compatible.

Position Independent Executable (PIE) support will allow extending the
KASLR randomization range below the -2G memory limit.

Signed-off-by: Thomas Garnier <thgarnie@google.com>
---
 arch/x86/entry/entry_64.S            | 16 ++++++++++------
 arch/x86/kernel/relocate_kernel_64.S |  8 +++-----
 2 files changed, 13 insertions(+), 11 deletions(-)
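
For illustration, the kind of change the series makes is sketched below (not
taken from the patch; some_symbol is a placeholder):

	/* Absolute reference: the immediate is a sign-extended 32-bit value,
	 * which is why the kernel image normally has to sit in the top 2G */
	movq	$some_symbol, %rax

	/* RIP-relative reference: only the offset from the current instruction
	 * is encoded, so it works wherever the kernel image is placed */
	leaq	some_symbol(%rip), %rax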

Comments

Peter Zijlstra March 14, 2018, 10:29 a.m. UTC | #1
On Tue, Mar 13, 2018 at 01:59:24PM -0700, Thomas Garnier wrote:
> @@ -1576,7 +1578,9 @@ first_nmi:
>  	addq	$8, (%rsp)	/* Fix up RSP */
>  	pushfq			/* RFLAGS */
>  	pushq	$__KERNEL_CS	/* CS */
> -	pushq	$1f		/* RIP */
> +	pushq	%rax		/* Support Position Independent Code */
> +	leaq	1f(%rip), %rax	/* RIP */
> +	xchgq	%rax, (%rsp)	/* Restore RAX, put 1f */
>  	iretq			/* continues at repeat_nmi below */
>  	UNWIND_HINT_IRET_REGS
>  1:

Urgh, xchg with a memop has an implicit LOCK prefix.
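
For reference, xchg with a memory operand asserts its LOCK semantics whether
or not the prefix is written; the two forms below behave identically (an
illustration, not part of the patch):

	xchgq	%rax, (%rsp)		/* implicitly locked */
	lock xchgq %rax, (%rsp)		/* the same bus-locked operation, spelled out */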
Christoph Lameter (Ampere) March 14, 2018, 3:54 p.m. UTC | #2
On Wed, 14 Mar 2018, Peter Zijlstra wrote:

> On Tue, Mar 13, 2018 at 01:59:24PM -0700, Thomas Garnier wrote:
> > @@ -1576,7 +1578,9 @@ first_nmi:
> >  	addq	$8, (%rsp)	/* Fix up RSP */
> >  	pushfq			/* RFLAGS */
> >  	pushq	$__KERNEL_CS	/* CS */
> > -	pushq	$1f		/* RIP */
> > +	pushq	%rax		/* Support Position Independent Code */
> > +	leaq	1f(%rip), %rax	/* RIP */
> > +	xchgq	%rax, (%rsp)	/* Restore RAX, put 1f */
> >  	iretq			/* continues at repeat_nmi below */
> >  	UNWIND_HINT_IRET_REGS
> >  1:
>
> Urgh, xchg with a memop has an implicit LOCK prefix.

this_cpu_xchg uses a cmpxchg without the lock prefix as a replacement, to reduce latency.

From linux/arch/x86/include/asm/percpu.h

/*
 * xchg is implemented using cmpxchg without a lock prefix. xchg is
 * expensive due to the implied lock prefix.  The processor cannot prefetch
 * cachelines if xchg is used.
 */
#define percpu_xchg_op(var, nval)                                       \
({                                                                      \
        typeof(var) pxo_ret__;                                          \
        typeof(var) pxo_new__ = (nval);                                 \
        switch (sizeof(var)) {                                          \
        case 1:                                                         \
                asm("\n\tmov "__percpu_arg(1)",%%al"                    \
                    "\n1:\tcmpxchgb %2, "__percpu_arg(1)                \
                    "\n\tjnz 1b"                                        \
                            : "=&a" (pxo_ret__), "+m" (var)             \
                            : "q" (pxo_new__)                           \
                            : "memory");                                \
                break;                                                  \
        case 2:                                                         \
                asm("\n\tmov "__percpu_arg(1)",%%ax"                    \
                    "\n1:\tcmpxchgw %2, "__percpu_arg(1)                \
                    "\n\tjnz 1b"                                        \
                            : "=&a" (pxo_ret__), "+m" (var)             \
                            : "r" (pxo_new__)                           \
                            : "memory");                                \
                break;                                                  \
        case 4:                                                         \
                asm("\n\tmov "__percpu_arg(1)",%%eax"                   \
                    "\n1:\tcmpxchgl %2, "__percpu_arg(1)                \
                    "\n\tjnz 1b"                                        \
                            : "=&a" (pxo_ret__), "+m" (var)             \
                            : "r" (pxo_new__)                           \
                            : "memory");                                \
                break;                                                  \
        case 8:                                                         \
                asm("\n\tmov "__percpu_arg(1)",%%rax"                   \
                    "\n1:\tcmpxchgq %2, "__percpu_arg(1)                \
                    "\n\tjnz 1b"                                        \
                            : "=&a" (pxo_ret__), "+m" (var)             \
                            : "r" (pxo_new__)                           \
                            : "memory");                                \
                break;                                                  \
        default: __bad_percpu_size();                                   \
        }                                                               \
        pxo_ret__;                                                      \
})
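
In assembly terms, the 8-byte case above boils down to a lock-free exchange
loop along these lines (a sketch only; %rdi stands in for the %gs-relative
per-CPU operand the real macro uses, and %rsi holds the new value):

	movq	(%rdi), %rax		/* load the current value */
1:	cmpxchgq %rsi, (%rdi)		/* no LOCK prefix: store %rsi if the
					 * memory still equals %rax */
	jnz	1b			/* value changed under us; cmpxchg already
					 * reloaded %rax, so just retry */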
Thomas Garnier March 14, 2018, 5:09 p.m. UTC | #3
On Wed, Mar 14, 2018 at 8:55 AM Christopher Lameter <cl@linux.com> wrote:

> On Wed, 14 Mar 2018, Peter Zijlstra wrote:

> > On Tue, Mar 13, 2018 at 01:59:24PM -0700, Thomas Garnier wrote:
> > > @@ -1576,7 +1578,9 @@ first_nmi:
> > >     addq    $8, (%rsp)      /* Fix up RSP */
> > >     pushfq                  /* RFLAGS */
> > >     pushq   $__KERNEL_CS    /* CS */
> > > -   pushq   $1f             /* RIP */
> > > +   pushq   %rax            /* Support Position Independent Code */
> > > +   leaq    1f(%rip), %rax  /* RIP */
> > > +   xchgq   %rax, (%rsp)    /* Restore RAX, put 1f */
> > >     iretq                   /* continues at repeat_nmi below */
> > >     UNWIND_HINT_IRET_REGS
> > >  1:
> >
> > Urgh, xchg with a memop has an implicit LOCK prefix.

> this_cpu_xchg uses a cmpxchg without the lock prefix as a replacement, to reduce latency.

Great, I will update my implementation.

Thanks Peter and Christoph.


>  From linux/arch/x86/include/asm/percpu.h

> /*
>   * xchg is implemented using cmpxchg without a lock prefix. xchg is
>   * expensive due to the implied lock prefix.  The processor cannot prefetch
>   * cachelines if xchg is used.
>   */
> #define percpu_xchg_op(var, nval)                                       \
> ({                                                                      \
>          typeof(var) pxo_ret__;                                          \
>          typeof(var) pxo_new__ = (nval);                                 \
>          switch (sizeof(var)) {                                          \
>          case 1:                                                         \
>                  asm("\n\tmov "__percpu_arg(1)",%%al"                    \
>                      "\n1:\tcmpxchgb %2, "__percpu_arg(1)                \
>                      "\n\tjnz 1b"                                        \
>                              : "=&a" (pxo_ret__), "+m" (var)             \
>                              : "q" (pxo_new__)                           \
>                              : "memory");                                \
>                  break;                                                  \
>          case 2:                                                         \
>                  asm("\n\tmov "__percpu_arg(1)",%%ax"                    \
>                      "\n1:\tcmpxchgw %2, "__percpu_arg(1)                \
>                      "\n\tjnz 1b"                                        \
>                              : "=&a" (pxo_ret__), "+m" (var)             \
>                              : "r" (pxo_new__)                           \
>                              : "memory");                                \
>                  break;                                                  \
>          case 4:                                                         \
>                  asm("\n\tmov "__percpu_arg(1)",%%eax"                   \
>                      "\n1:\tcmpxchgl %2, "__percpu_arg(1)                \
>                      "\n\tjnz 1b"                                        \
>                              : "=&a" (pxo_ret__), "+m" (var)             \
>                              : "r" (pxo_new__)                           \
>                              : "memory");                                \
>                  break;                                                  \
>          case 8:                                                         \
>                  asm("\n\tmov "__percpu_arg(1)",%%rax"                   \
>                      "\n1:\tcmpxchgq %2, "__percpu_arg(1)                \
>                      "\n\tjnz 1b"                                        \
>                              : "=&a" (pxo_ret__), "+m" (var)             \
>                              : "r" (pxo_new__)                           \
>                              : "memory");                                \
>                  break;                                                  \
>          default: __bad_percpu_size();                                   \
>          }                                                               \
>          pxo_ret__;                                                      \
Paolo Bonzini March 15, 2018, 10:18 a.m. UTC | #4
On 14/03/2018 16:54, Christopher Lameter wrote:
>>> +	pushq	%rax		/* Support Position Independent Code */
>>> +	leaq	1f(%rip), %rax	/* RIP */
>>> +	xchgq	%rax, (%rsp)	/* Restore RAX, put 1f */
>>>  	iretq			/* continues at repeat_nmi below */
>>>  	UNWIND_HINT_IRET_REGS
>>>  1:
>> Urgh, xchg with a memop has an implicit LOCK prefix.
> this_cpu_xchg uses a cmpxchg without the lock prefix as a replacement, to reduce latency.

That requires using a second register, since %rax is used as the
comparison source.  At this point it's easier to just push %rax twice:

	pushq %rax
	pushq %rax
	leaq 1f(%rip), %rax
	movq %rax, 8(%rsp)
	popq %rax
	iretq

Paolo
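
For clarity, the suggested sequence with the stack bookkeeping spelled out (a
sketch based on the suggestion above, not the final patch):

	pushq	%rax			/* slot that will be overwritten with the RIP */
	pushq	%rax			/* temporary copy, popped back into %rax below */
	leaq	1f(%rip), %rax		/* target RIP, built without an absolute relocation */
	movq	%rax, 8(%rsp)		/* patch the lower slot: it becomes RIP for iretq */
	popq	%rax			/* restore %rax; %rsp now points at the RIP slot */
	iretq				/* consumes RIP (= 1f), CS and RFLAGS pushed earlier */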

Patch

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index bd53c57617e6..c53123468364 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -191,7 +191,7 @@  ENTRY(entry_SYSCALL_64_trampoline)
 	 * spill RDI and restore it in a second-stage trampoline.
 	 */
 	pushq	%rdi
-	movq	$entry_SYSCALL_64_stage2, %rdi
+	movabsq	$entry_SYSCALL_64_stage2, %rdi
 	JMP_NOSPEC %rdi
 END(entry_SYSCALL_64_trampoline)
 
@@ -1275,7 +1275,8 @@  ENTRY(error_entry)
 	movl	%ecx, %eax			/* zero extend */
 	cmpq	%rax, RIP+8(%rsp)
 	je	.Lbstep_iret
-	cmpq	$.Lgs_change, RIP+8(%rsp)
+	leaq	.Lgs_change(%rip), %rcx
+	cmpq	%rcx, RIP+8(%rsp)
 	jne	.Lerror_entry_done
 
 	/*
@@ -1480,10 +1481,10 @@  ENTRY(nmi)
 	 * resume the outer NMI.
 	 */
 
-	movq	$repeat_nmi, %rdx
+	leaq	repeat_nmi(%rip), %rdx
 	cmpq	8(%rsp), %rdx
 	ja	1f
-	movq	$end_repeat_nmi, %rdx
+	leaq	end_repeat_nmi(%rip), %rdx
 	cmpq	8(%rsp), %rdx
 	ja	nested_nmi_out
 1:
@@ -1537,7 +1538,8 @@  nested_nmi:
 	pushq	%rdx
 	pushfq
 	pushq	$__KERNEL_CS
-	pushq	$repeat_nmi
+	leaq	repeat_nmi(%rip), %rdx
+	pushq	%rdx
 
 	/* Put stack back */
 	addq	$(6*8), %rsp
@@ -1576,7 +1578,9 @@  first_nmi:
 	addq	$8, (%rsp)	/* Fix up RSP */
 	pushfq			/* RFLAGS */
 	pushq	$__KERNEL_CS	/* CS */
-	pushq	$1f		/* RIP */
+	pushq	%rax		/* Support Position Independent Code */
+	leaq	1f(%rip), %rax	/* RIP */
+	xchgq	%rax, (%rsp)	/* Restore RAX, put 1f */
 	iretq			/* continues at repeat_nmi below */
 	UNWIND_HINT_IRET_REGS
 1:
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index a7227dfe1a2b..0c0fc259a4e2 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -208,11 +208,9 @@  identity_mapped:
 	movq	%rax, %cr3
 	lea	PAGE_SIZE(%r8), %rsp
 	call	swap_pages
-	jmp	*virtual_mapped_addr(%rip)
-
-	/* Absolute value for PIE support */
-virtual_mapped_addr:
-	.quad virtual_mapped
+	movabsq $virtual_mapped, %rax
+	pushq	%rax
+	ret
 
 virtual_mapped:
 	movq	RSP(%r8), %rsp
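
Two hunks above keep absolute addresses rather than converting to RIP-relative
references (the trampoline's movabsq and the movabsq/pushq/ret sequence in
relocate_kernel_64.S). A minimal sketch of the two patterns involved, with
some_target as a placeholder symbol:

	/* movabsq carries the full 64-bit address as an immediate, whereas
	 * movq $sym, %rax only encodes a sign-extended 32-bit immediate */
	movabsq	$some_target, %rax

	/* pushq/ret: jump to the absolute address held in %rax by "returning"
	 * to it through the stack */
	pushq	%rax
	ret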