
[RFC,3/7] x86/entry: Implement atomic-IST-entry

Message ID 20230403140605.540512-4-jiangshanlai@gmail.com (mailing list archive)
State New, archived
Series x86/entry: Atomic stack switching for IST

Commit Message

Lai Jiangshan April 3, 2023, 2:06 p.m. UTC
From: Lai Jiangshan <jiangshan.ljs@antgroup.com>

See the comments in the cover-letter.  They will be moved into the code
and changelog here when improved.

Signed-off-by: Lai Jiangshan <jiangshan.ljs@antgroup.com>
---
 arch/x86/entry/Makefile          |   3 +
 arch/x86/entry/entry_64.S        | 193 ++++++++++++++++++++
 arch/x86/entry/ist_entry.c       | 299 +++++++++++++++++++++++++++++++
 arch/x86/kernel/asm-offsets_64.c |   7 +
 arch/x86/kernel/callthunks.c     |   2 +
 tools/objtool/check.c            |   7 +-
 6 files changed, 510 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/entry/ist_entry.c

Comments

Peter Zijlstra April 6, 2023, 9:01 p.m. UTC | #1
On Mon, Apr 03, 2023 at 10:06:01PM +0800, Lai Jiangshan wrote:

> diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
> index ca2fe186994b..7cc1254ca519 100644
> --- a/arch/x86/entry/Makefile
> +++ b/arch/x86/entry/Makefile
> @@ -8,11 +8,14 @@ UBSAN_SANITIZE := n
>  KCOV_INSTRUMENT := n
>  
>  CFLAGS_REMOVE_common.o		= $(CC_FLAGS_FTRACE)
> +CFLAGS_REMOVE_ist_entry.o	= $(CC_FLAGS_FTRACE) $(RETHUNK_CFLAGS)

This ^^^


> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index 49ddc4dd3117..50a24cc83581 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -443,6 +443,184 @@ SYM_CODE_END(\asmsym)

> +.macro idtentry_ist vector asmsym cfunc user_cfunc has_error_code:req, stack_offset:req
> +SYM_CODE_START(\asmsym)
> +	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
> +	ENDBR
> +
> +	/*
> +	 * Clear X86_EFLAGS_AC, X86_EFLAGS_DF and set a default ORIG_RAX.
> +	 *
> +	 * The code setting ORIG_RAX will not be replicated if interrupted.
> +	 */
> +	ASM_CLAC
> +	cld
> +
> +	.if \has_error_code == 0
> +		pushq	$-1		/* ORIG_RAX: no syscall to restart */
> +	.endif
> +
> +	/*
> +	 * No register can be touched except %rsp,%rflags,%rip before
> +	 * pushing all the registers.  It is indispensable for nested
> +	 * atomic-IST-entry to replicate pushing the registers.
> +	 */
> +	PUSH_REGS
> +
> +	/*
> +	 * Finished pushing registers; all registers can be touched now.
> +	 *
> +	 * Clear registers for the C function ist_copy_regs_to_main_stack()
> +	 * and the handler to avoid any possible exploitation of any
> +	 * speculation attack.
> +	 */
> +	CLEAR_REGS
> +
> +	/*
> +	 * Copy the pt_regs to the IST main stack including the pt_regs of
> +	 * the interrupted atomic-IST-entries, if any, by replicating.
> +	 */
> +	movq	%rsp, %rdi				/* pt_regs pointer on its own IST stack */
> +	leaq	PTREGS_SIZE-\stack_offset(%rsp), %rsi	/* struct cea_exception_stacks pointer */
> +	call	ist_copy_regs_to_main_stack

IIUC you do a CALL+RET here, before you call paranoid_entry ...

> +
> +	/*
> +	 * Commit stage.
> +	 */
> +SYM_INNER_LABEL(start_commit_\asmsym, SYM_L_GLOBAL)
> +	/*
> +	 * Switches to the IST main stack.  Before the switching is done,
> +	 * %rax is the copied pt_regs pointer in IST main stack.
> +	 */
> +	movq	%rax, %rsp
> +
> +	/*
> +	 * The label should be immediately after the instruction that switches
> +	 * the stack since there is code assuming there is only a single
> +	 * instruction in the commit stage and the code assumes "%rsp in the
> +	 * IST main stack is also the sign of ending an atomic-IST-entry".
> +	 * (The code will be removed in future when %rip-based identifying
> +	 * is added.)
> +	 */
> +SYM_INNER_LABEL(commit_\asmsym, SYM_L_GLOBAL)
> +
> +	/*
> +	 * Now, it is on the IST main stack.  For the whole kernel, the entries
> +	 * of the IST exceptions can be seen from here because the inside
> +	 * of the atomic-IST-entry can not be seen from the whole kernel
> +	 * except in the atomic-IST-entry or #DF.
> +	 */
> +	UNWIND_HINT_REGS
> +	ENCODE_FRAME_POINTER
> +
> +	/*
> +	 * The code setting ORIG_RAX will not be replicated if interrupted.
> +	 * So redo it here.
> +	 */
> +	.if \has_error_code == 0
> +		movq	$-1, ORIG_RAX(%rsp)	/* ORIG_RAX: no syscall to restart */
> +	.endif
> +
> +	/*
> +	 * If the entry is from userspace, switch stacks and treat it as
> +	 * a normal entry.
> +	 */
> +	testb	$3, CS(%rsp)
> +	jnz	.Lfrom_usermode_switch_stack_\@
> +
> +	/*
> +	 * paranoid_entry returns GS/CR3/SPEC_CTL information for
> +	 * paranoid_exit in RBX/R14/R15.
> +	 */
> +	call	paranoid_entry

... all the way down here, which will do:

  IBRS_ENTER;
  UNTRAIN_RET_FROM_CALL;

Which thus breaks the whole RetBleed mess, since that must not do RET
before that happens.
Peter Zijlstra April 6, 2023, 9:58 p.m. UTC | #2
On Mon, Apr 03, 2023 at 10:06:01PM +0800, Lai Jiangshan wrote:
> +static __always_inline
> +void copy_regs_exception_head(struct pt_regs *target, const struct pt_regs *from)
> +{
> +	target->ss	= from->ss;
> +	target->sp	= from->sp;
> +	target->flags 	= from->flags;
> +	target->cs	= from->cs;
> +	target->ip	= from->ip;
> +	target->orig_ax	= from->orig_ax;
> +}
> +
> +static __always_inline
> +void copy_regs_general_registers(struct pt_regs *target, const struct pt_regs *from)
> +{
> +	target->di  = from->di;
> +	target->si  = from->si;
> +	target->dx  = from->dx;
> +	target->cx  = from->cx;
> +	target->ax  = from->ax;
> +	target->r8  = from->r8;
> +	target->r9  = from->r9;
> +	target->r10 = from->r10;
> +	target->r11 = from->r11;
> +	target->bx  = from->bx;
> +	target->bp  = from->bp;
> +	target->r12 = from->r12;
> +	target->r13 = from->r13;
> +	target->r14 = from->r14;
> +	target->r15 = from->r15;
> +}

> +/* Replicate the interrupted atomic-IST-entry's CLEAR_REGS macro. */
> +static __always_inline void replicate_clear_regs(struct pt_regs *target)
> +{
> +	target->di  = 0;
> +	target->si  = 0;
> +	target->dx  = 0;
> +	target->cx  = 0;
> +	target->ax  = 0;
> +	target->r8  = 0;
> +	target->r9  = 0;
> +	target->r10 = 0;
> +	target->r11 = 0;
> +	target->bx  = 0;
> +	target->bp  = 0;
> +	target->r12 = 0;
> +	target->r13 = 0;
> +	target->r14 = 0;
> +	target->r15 = 0;
> +}

I think there are compilers smart enough to see through your attempts at
avoiding mem{set,cpy}() there, and I think we'll end up needing something
like __inline_memset() and __inline_memcpy() like here:

https://lore.kernel.org/lkml/Y759AJ%2F0N9fqwDED@hirez.programming.kicks-ass.net/
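
The idea in that series, reduced to a minimal sketch: hide the zeroing
behind inline asm so the compiler cannot recognize the pattern and emit a
call to memset().  The name and exact constraints below are illustrative,
not the final API from the linked patch:

static __always_inline void __inline_memset(void *addr, int v, size_t len)
{
	void *tmp = addr;

	/*
	 * "rep stosb" stores @len copies of %al starting at (%rdi).
	 * Keeping the loop inside asm prevents the compiler from
	 * lowering it to a memset() call with an early RET.
	 */
	asm volatile ("rep stosb"
		      : "+D" (tmp), "+c" (len)
		      : "a" (v)
		      : "memory");
}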
Andrew Cooper April 6, 2023, 11:07 p.m. UTC | #3
On 06/04/2023 10:58 pm, Peter Zijlstra wrote:
> On Mon, Apr 03, 2023 at 10:06:01PM +0800, Lai Jiangshan wrote:
>> +/* Replicate the interrupted atomic-IST-entry's CLEAR_REGS macro. */
>> +static __always_inline void replicate_clear_regs(struct pt_regs *target)
>> +{
>> +	target->di  = 0;
>> +	target->si  = 0;
>> +	target->dx  = 0;
>> +	target->cx  = 0;
>> +	target->ax  = 0;
>> +	target->r8  = 0;
>> +	target->r9  = 0;
>> +	target->r10 = 0;
>> +	target->r11 = 0;
>> +	target->bx  = 0;
>> +	target->bp  = 0;
>> +	target->r12 = 0;
>> +	target->r13 = 0;
>> +	target->r14 = 0;
>> +	target->r15 = 0;
>> +}
> I think there are compilers smart enough to see through your attempts at
> avoiding mem{set,cpy}() there

Indeed.  It took a little bit of convincing (needed 4 extra registers to
zero), but https://godbolt.org/z/7rvb8db66

Including Peter's other observation about speculation safety, you
basically can't have any C at all before passing IBRS/UNRET/whatever
else comes along in the future.

Otherwise, the compiler will make you wish you'd written it in asm the
first time around.

~Andrew
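
A reduced version of what the godbolt link demonstrates (hypothetical,
modelled on replicate_clear_regs(); the extra register pressure Andrew
mentions is omitted here): once the optimizer recognizes the store
pattern, the field-by-field zeroing may be emitted as a single memset()
call, whose RET then executes before any untraining sequence.

struct regs15 {
	unsigned long di, si, dx, cx, ax, r8, r9, r10, r11;
	unsigned long bx, bp, r12, r13, r14, r15;
};

void clear_regs(struct regs15 *t)
{
	/*
	 * 15 adjacent 8-byte zero stores; with enough surrounding
	 * register pressure the compiler may turn these into
	 * "call memset" regardless of the author's intent.
	 */
	t->di = 0; t->si = 0; t->dx = 0; t->cx = 0; t->ax = 0;
	t->r8 = 0; t->r9 = 0; t->r10 = 0; t->r11 = 0;
	t->bx = 0; t->bp = 0; t->r12 = 0; t->r13 = 0;
	t->r14 = 0; t->r15 = 0;
}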
Lai Jiangshan April 7, 2023, 2:33 a.m. UTC | #4
On Fri, Apr 7, 2023 at 5:01 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Apr 03, 2023 at 10:06:01PM +0800, Lai Jiangshan wrote:
>
> > diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
> > index ca2fe186994b..7cc1254ca519 100644
> > --- a/arch/x86/entry/Makefile
> > +++ b/arch/x86/entry/Makefile
> > @@ -8,11 +8,14 @@ UBSAN_SANITIZE := n
> >  KCOV_INSTRUMENT := n
> >
> >  CFLAGS_REMOVE_common.o               = $(CC_FLAGS_FTRACE)
> > +CFLAGS_REMOVE_ist_entry.o    = $(CC_FLAGS_FTRACE) $(RETHUNK_CFLAGS)
>
> This ^^^
>
>
> > diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> > index 49ddc4dd3117..50a24cc83581 100644
> > --- a/arch/x86/entry/entry_64.S
> > +++ b/arch/x86/entry/entry_64.S
> > @@ -443,6 +443,184 @@ SYM_CODE_END(\asmsym)
>
> > +.macro idtentry_ist vector asmsym cfunc user_cfunc has_error_code:req, stack_offset:req
> > +SYM_CODE_START(\asmsym)
> > +     UNWIND_HINT_IRET_REGS offset=\has_error_code*8
> > +     ENDBR
> > +
> > +     /*
> > +      * Clear X86_EFLAGS_AC, X86_EFLAGS_DF and set a default ORIG_RAX.
> > +      *
> > +      * The code setting ORIG_RAX will not be replicated if interrupted.
> > +      */
> > +     ASM_CLAC
> > +     cld
> > +
> > +     .if \has_error_code == 0
> > +             pushq   $-1             /* ORIG_RAX: no syscall to restart */
> > +     .endif
> > +
> > +     /*
> > +      * No register can be touched except %rsp,%rflags,%rip before
> > +      * pushing all the registers.  It is indispensable for nested
> > +      * atomic-IST-entry to replicate pushing the registers.
> > +      */
> > +     PUSH_REGS
> > +
> > +     /*
> > +      * Finished pushing registers; all registers can be touched now.
> > +      *
> > +      * Clear registers for the C function ist_copy_regs_to_main_stack()
> > +      * and the handler to avoid any possible exploitation of any
> > +      * speculation attack.
> > +      */
> > +     CLEAR_REGS
> > +
> > +     /*
> > +      * Copy the pt_regs to the IST main stack including the pt_regs of
> > +      * the interrupted atomic-IST-entries, if any, by replicating.
> > +      */
> > +     movq    %rsp, %rdi                              /* pt_regs pointer on its own IST stack */
> > +     leaq    PTREGS_SIZE-\stack_offset(%rsp), %rsi   /* struct cea_exception_stacks pointer */
> > +     call    ist_copy_regs_to_main_stack
>
> IIUC you do a CALL+RET here, before you call paranoid_entry ...
>
> > +
> > +     /*
> > +      * Commit stage.
> > +      */
> > +SYM_INNER_LABEL(start_commit_\asmsym, SYM_L_GLOBAL)
> > +     /*
> > +      * Switches to the IST main stack.  Before the switching is done,
> > +      * %rax is the copied pt_regs pointer in IST main stack.
> > +      */
> > +     movq    %rax, %rsp
> > +
> > +     /*
> > +      * The label should be immediate after the instruction that switches
> > +      * the stack since there is code assuming there is only one single
> > +      * instruction in the commit stage and the code assumes "%rsp in the
> > +      * IST main stack is also the sign of ending a atomic-IST-entry".
> > +      * (The code will be removed in future when %rip-based identifying
> > +      * is added.)
> > +      */
> > +SYM_INNER_LABEL(commit_\asmsym, SYM_L_GLOBAL)
> > +
> > +     /*
> > +      * Now, it is on the IST main stack.  For the whole kernel, the entries
> > +      * of the IST exceptions can be seen from here because the inside
> > +      * of the atomic-IST-entry can not be seen from the whole kernel
> > +      * except in the atomic-IST-entry or #DF.
> > +      */
> > +     UNWIND_HINT_REGS
> > +     ENCODE_FRAME_POINTER
> > +
> > +     /*
> > +      * The code setting ORIG_RAX will not be replicated if interrupted.
> > +      * So redo it here.
> > +      */
> > +     .if \has_error_code == 0
> > +             movq    $-1, ORIG_RAX(%rsp)     /* ORIG_RAX: no syscall to restart */
> > +     .endif
> > +
> > +     /*
> > +      * If the entry is from userspace, switch stacks and treat it as
> > +      * a normal entry.
> > +      */
> > +     testb   $3, CS(%rsp)
> > +     jnz     .Lfrom_usermode_switch_stack_\@
> > +
> > +     /*
> > +      * paranoid_entry returns GS/CR3/SPEC_CTL information for
> > +      * paranoid_exit in RBX/R14/R15.
> > +      */
> > +     call    paranoid_entry
>
> ... all the way down here, which will do:
>
>   IBRS_ENTER;
>   UNTRAIN_RET_FROM_CALL;
>
> Which thus breaks the whole RetBleed mess, since that must not do RET
> before that happens.

I got it.
I will add the save-stage-3 in the atomic-IST-entry.

The benefits of the new stage:
  CR3/GSBASE/IBRS switching can be done in the C atomic-IST-entry.
  (^^^^^ Also the drawback: it complicates the code, and basically needs:
   https://lore.kernel.org/lkml/20211126101209.8613-1-jiangshanlai@gmail.com/ )
  The IST main stack can be in the kernel CR3, not necessarily in the CEA.
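
As a rough sketch of what a C-side CR3 switch in such a stage could look
like (the helper name and its placement are hypothetical; the real asm
SWITCH_TO_KERNEL_CR3 also sets the NOFLUSH bit when PCID is enabled,
which is omitted here for brevity):

static __always_inline void ist_switch_to_kernel_cr3(void)
{
	unsigned long cr3 = __native_read_cr3();

	/*
	 * With PTI, the user CR3 differs from the kernel CR3 in the
	 * PTI_USER_PGTABLE_MASK/PTI_USER_PCID_MASK bits; clearing
	 * them yields the kernel CR3.
	 */
	if (cpu_feature_enabled(X86_FEATURE_PTI) &&
	    (cr3 & PTI_USER_PGTABLE_MASK))
		native_write_cr3(cr3 & ~PTI_USER_PGTABLE_AND_PCID_MASK);
}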

Patch

diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index ca2fe186994b..7cc1254ca519 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -8,11 +8,14 @@  UBSAN_SANITIZE := n
 KCOV_INSTRUMENT := n
 
 CFLAGS_REMOVE_common.o		= $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_ist_entry.o	= $(CC_FLAGS_FTRACE) $(RETHUNK_CFLAGS)
 
 CFLAGS_common.o			+= -fno-stack-protector
+CFLAGS_ist_entry.o		+= -fno-stack-protector
 
 obj-y				:= entry.o entry_$(BITS).o syscall_$(BITS).o
 obj-y				+= common.o
+obj-$(CONFIG_X86_64)		+= ist_entry.o
 
 obj-y				+= vdso/
 obj-y				+= vsyscall/
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 49ddc4dd3117..50a24cc83581 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -443,6 +443,184 @@  SYM_CODE_END(\asmsym)
 	idtentry \vector asm_\cfunc \cfunc has_error_code=0
 .endm
 
+/**
+ * idtentry_ist - Macro to generate entry stubs for IST exceptions except #DF
+ * @vector:		Vector number
+ * @asmsym:		ASM symbol for the entry point
+ * @cfunc:		C function to be called when it occurs in kernel
+ * @user_cfunc:		C function to be called when it occurs in userspace
+ * @has_error_code:	Hardware pushed error code on stack
+ * @stack_offset:	Offset of the IST stack top in struct cea_exception_stacks
+ *
+ * The macro emits code to set up the kernel context for IST exceptions.
+ *
+ * Everything from the hardware entry of the event to the
+ * SYM_INNER_LABEL(commit_\asmsym) is the atomic-IST-entry (note: the
+ * atomic-IST-entry starts at the hardware entry, not merely at the first
+ * instruction of this macro).
+ *
+ * The atomic-IST-entry pushes pt_regs and copies the pt_regs to the IST
+ * main stack, and switches to it.  If the atomic-IST-entry is interrupted
+ * by another IST event (except #DF), the new atomic-IST-entry replicates
+ * the interrupted one, so that every atomic-IST-entry appears atomic.
+ *
+ * See the comments in ist_entry.c.
+ *
+ * When the CPU is on any IST stack or the IST main stack, %rsp cannot be
+ * switched away except by being interrupted by an IST exception or by
+ * totally switching off the stack (no usable data left).
+ *
+ * If the entry comes from user space, it eventually switches to the
+ * normal entry path on its kernel stack, including the return-to-user-space
+ * work and preemption checks on exit.  The macro idtentry_body ensures
+ * the IST main stack is totally switched off (no usable data left) at
+ * the same time it switches to the kernel stack.
+ *
+ * If it hits in kernel mode, it needs to go through the paranoid entry,
+ * as the exception can hit any random state.  There is no preemption
+ * check on exit, to keep the paranoid path simple.
+ */
+.macro idtentry_ist vector asmsym cfunc user_cfunc has_error_code:req, stack_offset:req
+SYM_CODE_START(\asmsym)
+	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
+	ENDBR
+
+	/*
+	 * Clear X86_EFLAGS_AC, X86_EFLAGS_DF and set a default ORIG_RAX.
+	 *
+	 * The code setting ORIG_RAX will not be replicated if interrupted.
+	 */
+	ASM_CLAC
+	cld
+
+	.if \has_error_code == 0
+		pushq	$-1		/* ORIG_RAX: no syscall to restart */
+	.endif
+
+	/*
+	 * No register except %rsp, %rflags and %rip can be touched before
+	 * all the registers are pushed.  This is indispensable for a nested
+	 * atomic-IST-entry to replicate the register pushes.
+	 */
+	PUSH_REGS
+
+	/*
+	 * Finished pushing registers; all registers can be touched now.
+	 *
+	 * Clear registers for the C function ist_copy_regs_to_main_stack()
+	 * and the handler, to avoid any possible exploitation by a
+	 * speculation attack.
+	 */
+	CLEAR_REGS
+
+	/*
+	 * Copy the pt_regs to the IST main stack, including the pt_regs of
+	 * the interrupted atomic-IST-entries, if any, by replicating.
+	 */
+	movq	%rsp, %rdi				/* pt_regs pointer on its own IST stack */
+	leaq	PTREGS_SIZE-\stack_offset(%rsp), %rsi	/* struct cea_exception_stacks pointer */
+	call	ist_copy_regs_to_main_stack
+
+	/*
+	 * Commit stage.
+	 */
+SYM_INNER_LABEL(start_commit_\asmsym, SYM_L_GLOBAL)
+	/*
+	 * Switches to the IST main stack.  Before the switching is done,
+	 * %rax is the copied pt_regs pointer in IST main stack.
+	 */
+	movq	%rax, %rsp
+
+	/*
+	 * The label should be immediately after the instruction that switches
+	 * the stack, since there is code assuming there is only a single
+	 * instruction in the commit stage and that "%rsp in the IST main
+	 * stack is also the sign of ending an atomic-IST-entry".
+	 * (That code will be removed in the future when %rip-based
+	 * identification is added.)
+	 */
+SYM_INNER_LABEL(commit_\asmsym, SYM_L_GLOBAL)
+
+	/*
+	 * Now it is on the IST main stack.  To the rest of the kernel, the
+	 * IST exception entries begin here, because the inside of an
+	 * atomic-IST-entry cannot be seen by anything except another
+	 * atomic-IST-entry or #DF.
+	 */
+	UNWIND_HINT_REGS
+	ENCODE_FRAME_POINTER
+
+	/*
+	 * The code setting ORIG_RAX will not be replicated if interrupted.
+	 * So redo it here.
+	 */
+	.if \has_error_code == 0
+		movq	$-1, ORIG_RAX(%rsp)	/* ORIG_RAX: no syscall to restart */
+	.endif
+
+	/*
+	 * If the entry is from userspace, switch stacks and treat it as
+	 * a normal entry.
+	 */
+	testb	$3, CS(%rsp)
+	jnz	.Lfrom_usermode_switch_stack_\@
+
+	/*
+	 * paranoid_entry returns GS/CR3/SPEC_CTRL information for
+	 * paranoid_exit in RBX/R14/R15.
+	 */
+	call	paranoid_entry
+
+	movq	%rsp, %rdi		/* pt_regs pointer */
+	.if \has_error_code == 1
+		movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument*/
+		movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */
+	.endif
+	call	\cfunc
+
+	jmp	paranoid_exit
+
+.Lfrom_usermode_switch_stack_\@:
+	/* Switch context: GS_BASE, CR3, SPEC_CTRL. */
+	swapgs
+	FENCE_SWAPGS_USER_ENTRY
+	/* We have user CR3.  Change to kernel CR3. */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
+	IBRS_ENTER
+	UNTRAIN_RET
+
+	/* Put the pt_regs onto the kernel task stack. */
+	movq	%rsp, %rdi			/* arg0 = pt_regs pointer */
+	call	sync_regs
+
+	/*
+	 * Switch to the kernel task stack and use the user entry point.
+	 *
+	 * When coming from user mode, the procedure has to atomically switch
+	 * off the TSS-configured IST stacks too, so it switches to the IST
+	 * main stack first, and then switches off the IST main stack in an
+	 * atomic fashion: once %rsp leaves the IST main stack, the IST main
+	 * stack is totally free.
+	 */
+	movq	%rax, %rsp
+	UNWIND_HINT_REGS
+	ENCODE_FRAME_POINTER
+
+	movq	%rsp, %rdi			/* pt_regs pointer into 1st argument*/
+	.if \has_error_code == 1
+		movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument*/
+		movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */
+	.endif
+	call	\user_cfunc
+
+	/* For some configurations \user_cfunc ends up being a noreturn. */
+	REACHABLE
+
+	jmp	error_return
+
+_ASM_NOKPROBE(\asmsym)
+SYM_CODE_END(\asmsym)
+.endm
+
 /**
  * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB
  * @vector:		Vector number
@@ -586,8 +764,23 @@  SYM_CODE_END(\asmsym)
  */
 .macro idtentry_df vector asmsym cfunc
 SYM_CODE_START(\asmsym)
+
+	/*
+	 * This unwind hint is incorrect if it is the soft double fault raised
+	 * from ist_double_fault().  That doesn't matter, since the double
+	 * fault is unrecoverable anyway.
+	 */
 	UNWIND_HINT_IRET_REGS offset=8
 	ENDBR
+
+	/*
+	 * Set %rsp = %rsp - 8 if it is the soft double fault raised from
+	 * ist_double_fault().  The CPU doesn't push an error code in that
+	 * case since the exception is injected by an INT instruction.
+	 */
+	btr	$3, %rsp
+	UNWIND_HINT_IRET_REGS offset=8
+
 	ASM_CLAC
 	cld
 
diff --git a/arch/x86/entry/ist_entry.c b/arch/x86/entry/ist_entry.c
new file mode 100644
index 000000000000..e1b06306ac51
--- /dev/null
+++ b/arch/x86/entry/ist_entry.c
@@ -0,0 +1,299 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  Copyright (C) 2022-2023 Lai Jiangshan, Ant Group
+ *
+ * Handle entries and exits for hardware traps and faults.
+ *
+ * It is as low level as entry_64.S and its code can run in environments
+ * where the GS base is a user-controlled value, or the CR3 is the PTI
+ * user CR3, or both.
+ */
+#include <asm/traps.h>
+
+#define IST_DOUBLE_FAULT_VECTOR 8
+
+static __always_inline void ist_double_fault(void)
+{
+	asm volatile ("int $" __stringify(IST_DOUBLE_FAULT_VECTOR));
+}
+
+#define IN_CEA_ESTACK(ceastp, name, sp)			\
+	((CEA_ESTACK_BOT(ceastp, name) <= (sp)) &&	\
+	 ((sp) < CEA_ESTACK_TOP(ceastp, name)))
+
+struct ist_ctx {
+	const struct pt_regs *regs;
+	unsigned long commit_ip;
+};
+
+#define DEFINE_IDENTIFY_IST(stack_name, sym_name, is_enabled)			\
+extern char commit_asm_exc_##sym_name[];					\
+static __always_inline bool identify_ist_##sym_name(				\
+		const struct pt_regs *regs, struct cea_exception_stacks *stacks,\
+		struct ist_ctx *ctx)						\
+{										\
+	if (!(is_enabled))							\
+		return false;							\
+	if (!IN_CEA_ESTACK(stacks, stack_name, regs->sp))			\
+		return false;							\
+	ctx->regs = (struct pt_regs *)CEA_ESTACK_TOP(stacks, stack_name) - 1;	\
+	ctx->commit_ip = (unsigned long)commit_asm_exc_##sym_name;		\
+	return true;								\
+}
+
+DEFINE_IDENTIFY_IST(NMI, nmi, false)
+DEFINE_IDENTIFY_IST(DB, debug, false)
+DEFINE_IDENTIFY_IST(MCE, machine_check, false)
+DEFINE_IDENTIFY_IST(VC, vmm_communication, false)
+
+static __always_inline bool identify_ist(
+		const struct pt_regs *regs, struct cea_exception_stacks *stacks,
+		struct ist_ctx *ctx)
+{
+	return	identify_ist_nmi(regs, stacks, ctx) ||
+		identify_ist_debug(regs, stacks, ctx) ||
+		identify_ist_machine_check(regs, stacks, ctx) ||
+		identify_ist_vmm_communication(regs, stacks, ctx);
+}
+
+/*
+ * Identify whether an interrupted atomic-IST-entry successfully saved
+ * the general registers onto its IST stack.
+ *
+ * Generally, the outmost atomic-IST-entry has likely saved the general
+ * registers successfully.  If not, one of the nested atomic-IST-entries
+ * must have saved the general registers of the context that the outmost
+ * atomic-IST-entry interrupted.
+ *
+ * Arguments:
+ *   @nested: the nested atomic-IST-entry that interrupted @interrupted
+ *   @interrupted: the interrupted atomic-IST-entry.
+ *
+ * Returns:
+ *   true:  the interrupted atomic-IST-entry had saved the general registers.
+ *   false: the interrupted atomic-IST-entry had not yet saved the general registers.
+ */
+static __always_inline
+bool identify_if_gp_registers_saved(const struct pt_regs *nested, const struct pt_regs *interrupted)
+{
+	return nested->sp <= (unsigned long)(void *)interrupted;
+}
+
+static __always_inline
+void copy_regs_exception_head(struct pt_regs *target, const struct pt_regs *from)
+{
+	target->ss	= from->ss;
+	target->sp	= from->sp;
+	target->flags	= from->flags;
+	target->cs	= from->cs;
+	target->ip	= from->ip;
+	target->orig_ax	= from->orig_ax;
+}
+
+static __always_inline
+void copy_regs_general_registers(struct pt_regs *target, const struct pt_regs *from)
+{
+	target->di  = from->di;
+	target->si  = from->si;
+	target->dx  = from->dx;
+	target->cx  = from->cx;
+	target->ax  = from->ax;
+	target->r8  = from->r8;
+	target->r9  = from->r9;
+	target->r10 = from->r10;
+	target->r11 = from->r11;
+	target->bx  = from->bx;
+	target->bp  = from->bp;
+	target->r12 = from->r12;
+	target->r13 = from->r13;
+	target->r14 = from->r14;
+	target->r15 = from->r15;
+}
+
+/*
+ * Do the work of the outmost atomic-IST-entry: copy the supposed pt_regs
+ * of the interrupted context to the IST main stack.  (If the ongoing
+ * atomic-IST-entry is the outmost one, the work is literally its own copy;
+ * if not, the work replicates the outmost's.)
+ *
+ * The hardware entry of the outmost atomic-IST-entry has already saved the
+ * exception head of the pt_regs.  If the outmost atomic-IST-entry was
+ * unfortunately interrupted before fully saving all the general registers,
+ * the general registers are untouched and must be saved by one of the
+ * subsequent nested atomic-IST-entries.  The identifying code can simply
+ * examine all the nested atomic-IST-entries to find which one saved
+ * the general registers.
+ */
+static __always_inline
+void copy_outmost(struct pt_regs *target, const struct pt_regs *outmost, const struct pt_regs *gp)
+{
+	copy_regs_exception_head(target, outmost);
+	copy_regs_general_registers(target, gp);
+}
+
+/*
+ * Replicate the interrupted atomic-IST-entry's CLAC and CLD in the ASM
+ * code.  Even if SMAP is not enabled, CLAC is replicated unconditionally
+ * since doing so does no harm.
+ */
+static __always_inline void replicate_clac_cld(struct pt_regs *target)
+{
+	target->flags &= ~(unsigned long)(X86_EFLAGS_AC | X86_EFLAGS_DF);
+}
+
+/* Replicate the interrupted atomic-IST-entry's CLEAR_REGS macro. */
+static __always_inline void replicate_clear_regs(struct pt_regs *target)
+{
+	target->di  = 0;
+	target->si  = 0;
+	target->dx  = 0;
+	target->cx  = 0;
+	target->ax  = 0;
+	target->r8  = 0;
+	target->r9  = 0;
+	target->r10 = 0;
+	target->r11 = 0;
+	target->bx  = 0;
+	target->bp  = 0;
+	target->r12 = 0;
+	target->r13 = 0;
+	target->r14 = 0;
+	target->r15 = 0;
+}
+
+/*
+ * Replicate the fact that the interrupted atomic-IST-entry's
+ * ist_copy_regs_to_main_stack() clobbers caller-saved registers.
+ */
+static __always_inline void replicate_func_clobber(struct pt_regs *target)
+{
+	/* nothing needs to be done. */
+}
+
+/*
+ * Replicate the copy operation in the interrupted atomic-IST-entry's
+ * ist_copy_regs_to_main_stack()
+ */
+static __always_inline void replicate_func_copy(struct pt_regs *target)
+{
+	/*
+	 * To avoid recursive function calls with __always_inline, the
+	 * copy operation for the interrupted atomic-IST-entry has already
+	 * been done in the caller of copy_nested().  Nothing needs to be
+	 * done here.
+	 */
+}
+
+#define IST_FRAME_SIZE	ALIGN(sizeof(struct pt_regs), 16)
+
+/*
+ * Replicate the return value (in %rax) of the interrupted atomic-IST-entry's
+ * ist_copy_regs_to_main_stack() and the commit operation.
+ */
+static __always_inline void replicate_func_result_and_commit(struct pt_regs *target, unsigned long commit_ip)
+{
+	void *target_of_interrupted = (void *)target + IST_FRAME_SIZE;
+
+	/* return result in %rax */
+	target->ax = (unsigned long)target_of_interrupted;
+	/* movq %rax, %rsp */
+	target->sp = (unsigned long)target_of_interrupted;
+	/* the %rip advances to commit point */
+	target->ip = commit_ip;
+}
+
+/*
+ * Do the work as a nested atomic-IST-entry to copy the supposed pt_regs
+ * of the interrupted context to the IST main stack.
+ *
+ * The hardware entry of the nested atomic-IST-entry has already saved
+ * the exception head of the pt_regs of the interrupted context (inside
+ * the interrupted atomic-IST-entry).  To maintain the atomic property
+ * of the atomic-IST-entry, copy_nested() (of the ongoing nested
+ * atomic-IST-entry) has to replicate everything the interrupted
+ * atomic-IST-entries would have done up to the commit point and
+ * copy the context (pt_regs) that would have been saved.
+ *
+ * To avoid touching any saved pt_regs, the replication is applied
+ * directly to the target pt_regs.
+ */
+static __always_inline
+void copy_nested(struct pt_regs *target, const struct pt_regs *nested, unsigned long commit_ip)
+{
+	copy_regs_exception_head(target, nested);
+	replicate_clac_cld(target);
+	replicate_clear_regs(target);
+	replicate_func_clobber(target);
+	replicate_func_copy(target);
+	replicate_func_result_and_commit(target, commit_ip);
+}
+
+asmlinkage __visible __noinstr_section(".entry.text")
+struct pt_regs *ist_copy_regs_to_main_stack(
+		const struct pt_regs *regs, struct cea_exception_stacks *stacks)
+{
+	unsigned long ist_main_sp = CEA_ESTACK_TOP(stacks, IST);
+	struct ist_ctx ist_ctx[8];
+	const struct pt_regs *gp_saved;
+	struct pt_regs *target;
+	int nr_entries, i;
+
+	/*
+	 * Identify all of the atomic-IST-entries.
+	 *
+	 * The current ongoing atomic-IST-entry doesn't need to be identified,
+	 * but is also put in @ist_ctx[0] for later convenience.
+	 *
+	 * The for-loop identifies the context that @regs interrupted,
+	 * travelling back to the outmost atomic-IST-entry.
+	 *
+	 * Result:
+	 *   The identified results are put in ist_ctx[i].
+	 *   ist_ctx[0] is the current ongoing atomic-IST-entry.
+	 *   ist_ctx[nr_entries-1] is the outmost atomic-IST-entry.
+	 *   gp_saved is the pt_regs of the entry that saved the general registers.
+	 */
+	ist_ctx[0].regs = regs;
+	ist_ctx[0].commit_ip = -1; /* unused */
+	nr_entries = 1;
+	gp_saved = regs;
+	for (;;) {
+		if (user_mode((struct pt_regs *)regs))
+			break;
+		if (ip_within_syscall_gap((struct pt_regs *)regs))
+			break;
+		if (!identify_ist(regs, stacks, &ist_ctx[nr_entries])) {
+			/* locate the top of copying target pt_regs */
+			if (IN_CEA_ESTACK(stacks, IST, regs->sp))
+				ist_main_sp = ALIGN_DOWN(regs->sp, 16);
+			break;
+		}
+		if (identify_if_gp_registers_saved(regs, ist_ctx[nr_entries].regs))
+			gp_saved = ist_ctx[nr_entries].regs;
+		regs = ist_ctx[nr_entries].regs;
+		nr_entries++;
+		if (nr_entries >= ARRAY_SIZE(ist_ctx))
+			ist_double_fault();
+	}
+
+	if (!IN_CEA_ESTACK(stacks, IST, ist_main_sp - IST_FRAME_SIZE * nr_entries))
+		ist_double_fault();
+
+	/*
+	 * Copy the saved pt_regs to the IST main stack.
+	 *
+	 * For each atomic-IST-entry, including the interrupted ones and
+	 * the current ongoing one, call either copy_outmost() or copy_nested()
+	 * to copy the pt_regs that should have been saved, replicating
+	 * if needed, to the IST main stack.
+	 */
+	ist_main_sp -= IST_FRAME_SIZE;
+	target = (void *)ist_main_sp;
+	copy_outmost(target, ist_ctx[nr_entries - 1].regs, gp_saved);
+	for (i = nr_entries - 2; unlikely(i >= 0); i--) {
+		ist_main_sp -= IST_FRAME_SIZE;
+		target = (void *)ist_main_sp;
+		copy_nested(target, ist_ctx[i].regs, ist_ctx[i+1].commit_ip);
+	}
+
+	return target;
+}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index bb65371ea9df..f861a56c0002 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -60,5 +60,12 @@  int main(void)
 	OFFSET(FIXED_stack_canary, fixed_percpu_data, stack_canary);
 	BLANK();
 #endif
+
+	DEFINE(CEA_stacks_NMI, offsetofend(struct cea_exception_stacks, NMI_stack));
+	DEFINE(CEA_stacks_DB,  offsetofend(struct cea_exception_stacks, DB_stack));
+	DEFINE(CEA_stacks_MCE, offsetofend(struct cea_exception_stacks, MCE_stack));
+	DEFINE(CEA_stacks_VC,  offsetofend(struct cea_exception_stacks, VC_stack));
+	BLANK();
+
 	return 0;
 }
diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c
index ffea98f9064b..e756c89996d8 100644
--- a/arch/x86/kernel/callthunks.c
+++ b/arch/x86/kernel/callthunks.c
@@ -123,6 +123,8 @@  static bool skip_addr(void *dest)
 {
 	if (dest == error_entry)
 		return true;
+	if (dest == ist_copy_regs_to_main_stack)
+		return true;
 	if (dest == paranoid_entry)
 		return true;
 	if (dest == xen_error_entry)
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index f937be1afe65..8dfa627d4b41 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -3998,6 +3998,11 @@  static int validate_unret(struct objtool_file *file)
 	return warnings;
 }
 
+static bool in_ist_entry(struct instruction *insn)
+{
+	return !strcmp(insn->sym->name, "ist_copy_regs_to_main_stack");
+}
+
 static int validate_retpoline(struct objtool_file *file)
 {
 	struct instruction *insn;
@@ -4016,7 +4021,7 @@  static int validate_retpoline(struct objtool_file *file)
 			continue;
 
 		if (insn->type == INSN_RETURN) {
-			if (opts.rethunk) {
+			if (opts.rethunk && !in_ist_entry(insn)) {
 				WARN_FUNC("'naked' return found in RETHUNK build",
 					  insn->sec, insn->offset);
 			} else