diff mbox series

[v13,26/35] x86/fred: FRED entry/exit and dispatch code

Message ID 20231205105030.8698-27-xin3.li@intel.com (mailing list archive)
State New, archived
Headers show
Series x86: enable FRED for x86-64 | expand

Commit Message

Li, Xin3 Dec. 5, 2023, 10:50 a.m. UTC
From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

Add the code that actually handles kernel and event entry/exit using
FRED. It is split up into two files thus:

- entry_64_fred.S contains the actual entrypoints and exit code, and
  saves and restores registers.
- entry_fred.c contains the two-level event dispatch code for FRED.
  The first-level dispatch is on the event type, and the second-level
  is on the event vector.

Originally-by: Megha Dey <megha.dey@intel.com>
Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Shan Kang <shan.kang@intel.com>
Co-developed-by: Xin Li <xin3.li@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---

Changes since v10:
* Replace "IS_ENABLED(CONFIG_IA32_EMULATION)" with the new ia32_enabled()
  API (Nikolay Borisov).

Changes since v9:
* Don't use jump tables, indirect jumps are expensive (Thomas Gleixner).
* Except NMI/#DB/#MCE, FRED really can share the exception handlers
  with IDT (Thomas Gleixner).
* Avoid the sysvec_* idt_entry muck, do it at a central place, reuse code
  instead of blindly copying it, which breaks the performance optimized
  sysvec entries like reschedule_ipi (Thomas Gleixner).
* Add asm_ prefix to FRED asm entry points (Thomas Gleixner).

Changes since v8:
* Don't do syscall early out in fred_entry_from_user() before there are
  proper performance numbers and justifications (Thomas Gleixner).
* Add the control exception handler to the FRED exception handler table
  (Thomas Gleixner).
* Add ENDBR to the FRED_ENTER asm macro.
* Reflect the FRED spec 5.0 change that ERETS and ERETU add 8 to %rsp
  before popping the return context from the stack.

Changes since v1:
* Initialize a FRED exception handler to fred_bad_event() instead of NULL
  if no FRED handler defined for an exception vector (Peter Zijlstra).
* Push calling irqentry_{enter,exit}() and instrumentation_{begin,end}()
  down into individual FRED exception handlers, instead of in the dispatch
  framework (Peter Zijlstra).
---
 arch/x86/entry/Makefile               |   5 +-
 arch/x86/entry/entry_64_fred.S        |  52 ++++++
 arch/x86/entry/entry_fred.c           | 230 ++++++++++++++++++++++++++
 arch/x86/include/asm/asm-prototypes.h |   1 +
 arch/x86/include/asm/fred.h           |   6 +
 5 files changed, 293 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/entry/entry_64_fred.S
 create mode 100644 arch/x86/entry/entry_fred.c

Comments

Andrew Cooper Dec. 5, 2023, 12:25 p.m. UTC | #1
On 05/12/2023 10:50 am, Xin Li wrote:
> diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c
> new file mode 100644
> index 000000000000..215883e90f94
> --- /dev/null
> +++ b/arch/x86/entry/entry_fred.c
> @@ -0,0 +1,230 @@
> ...
> +static noinstr void fred_intx(struct pt_regs *regs)
> +{
> +	switch (regs->fred_ss.vector) {
> +	/* INT0 */

INTO (for overflow), not INT-zero.  However...

> +	case X86_TRAP_OF:
> +		exc_overflow(regs);
> +		return;
> +
> +	/* INT3 */
> +	case X86_TRAP_BP:
> +		exc_int3(regs);
> +		return;

... neither OF nor BP will ever enter fred_intx() because they're type
SWEXC not SWINT.

SWINT is strictly the INT $imm8 instruction.

> ...
> +static noinstr void fred_extint(struct pt_regs *regs)
> +{
> +	unsigned int vector = regs->fred_ss.vector;
> +
> +	if (WARN_ON_ONCE(vector < FIRST_EXTERNAL_VECTOR))
> +		return;
> +
> +	if (likely(vector >= FIRST_SYSTEM_VECTOR)) {
> +		irqentry_state_t state = irqentry_enter(regs);
> +
> +		instrumentation_begin();
> +		sysvec_table[vector - FIRST_SYSTEM_VECTOR](regs);

array_index_mask_nospec()

This is easy for an attacker to abuse, to install non-function-pointer
targets into the indirect predictor.

> +		instrumentation_end();
> +		irqentry_exit(regs, state);
> +	} else {
> +		common_interrupt(regs, vector);
> +	}
> +}
> +
> +static noinstr void fred_exception(struct pt_regs *regs, unsigned long error_code)
> +{
> +	/* Optimize for #PF. That's the only exception which matters performance wise */
> +	if (likely(regs->fred_ss.vector == X86_TRAP_PF)) {
> +		exc_page_fault(regs, error_code);
> +		return;
> +	}
> +
> +	switch (regs->fred_ss.vector) {
> +	case X86_TRAP_DE: return exc_divide_error(regs);
> +	case X86_TRAP_DB: return fred_exc_debug(regs);
> +	case X86_TRAP_BP: return exc_int3(regs);
> +	case X86_TRAP_OF: return exc_overflow(regs);

Depending on what you want to do with BP/OF vs fred_intx(), this may
need adjusting.

If you are cross-checking type and vector, then these should be rejected
for not being of type HWEXC.

> +	case X86_TRAP_BR: return exc_bounds(regs);
> +	case X86_TRAP_UD: return exc_invalid_op(regs);
> +	case X86_TRAP_NM: return exc_device_not_available(regs);
> +	case X86_TRAP_DF: return exc_double_fault(regs, error_code);
> +	case X86_TRAP_TS: return exc_invalid_tss(regs, error_code);
> +	case X86_TRAP_NP: return exc_segment_not_present(regs, error_code);
> +	case X86_TRAP_SS: return exc_stack_segment(regs, error_code);
> +	case X86_TRAP_GP: return exc_general_protection(regs, error_code);
> +	case X86_TRAP_MF: return exc_coprocessor_error(regs);
> +	case X86_TRAP_AC: return exc_alignment_check(regs, error_code);
> +	case X86_TRAP_XF: return exc_simd_coprocessor_error(regs);
> +
> +#ifdef CONFIG_X86_MCE
> +	case X86_TRAP_MC: return fred_exc_machine_check(regs);
> +#endif
> +#ifdef CONFIG_INTEL_TDX_GUEST
> +	case X86_TRAP_VE: return exc_virtualization_exception(regs);
> +#endif
> +#ifdef CONFIG_X86_KERNEL_IBT

CONFIG_X86_CET

Userspace can use CET even if the kernel isn't compiled with IBT, so
this exception needs handling.

> +	case X86_TRAP_CP: return exc_control_protection(regs, error_code);
> +#endif
> +	default: return fred_bad_type(regs, error_code);
> +	}
> +}
> +
> +__visible noinstr void fred_entry_from_user(struct pt_regs *regs)
> +{
> +	unsigned long error_code = regs->orig_ax;
> +
> +	/* Invalidate orig_ax so that syscall_get_nr() works correctly */
> +	regs->orig_ax = -1;
> +
> +	switch (regs->fred_ss.type) {
> +	case EVENT_TYPE_EXTINT:
> +		return fred_extint(regs);
> +	case EVENT_TYPE_NMI:
> +		return fred_exc_nmi(regs);
> +	case EVENT_TYPE_SWINT:
> +		return fred_intx(regs);
> +	case EVENT_TYPE_HWEXC:
> +	case EVENT_TYPE_SWEXC:
> +	case EVENT_TYPE_PRIV_SWEXC:
> +		return fred_exception(regs, error_code);

PRIV_SWEXC should have its own function and not fall into fred_exception().

It is strictly only the ICEBP (INT1) instruction at the moment, so
should fall into bad_type() for any vector other than X86_TRAP_DB.

> +	case EVENT_TYPE_OTHER:
> +		return fred_other(regs);
> +	default:
> +		return fred_bad_type(regs, error_code);
> +	}
> +}

~Andrew
Li, Xin3 Dec. 5, 2023, 7:03 p.m. UTC | #2
> > +static noinstr void fred_intx(struct pt_regs *regs) {
> > +	switch (regs->fred_ss.vector) {
> > +	/* INT0 */
> 
> INTO (for overflow), not INT-zero.  However...
> 
> > +	case X86_TRAP_OF:
> > +		exc_overflow(regs);
> > +		return;
> > +
> > +	/* INT3 */
> > +	case X86_TRAP_BP:
> > +		exc_int3(regs);
> > +		return;
> 
> ... neither OF nor BP will ever enter fred_intx() because they're type SWEXC not
> SWINT.
> 
> SWINT is strictly the INT $imm8 instruction.
> 
> > ...
> > +static noinstr void fred_extint(struct pt_regs *regs) {
> > +	unsigned int vector = regs->fred_ss.vector;
> > +
> > +	if (WARN_ON_ONCE(vector < FIRST_EXTERNAL_VECTOR))
> > +		return;
> > +
> > +	if (likely(vector >= FIRST_SYSTEM_VECTOR)) {
> > +		irqentry_state_t state = irqentry_enter(regs);
> > +
> > +		instrumentation_begin();
> > +		sysvec_table[vector - FIRST_SYSTEM_VECTOR](regs);
> 
> array_index_mask_nospec()
> 
> This is easy for an attacker to abuse, to install non-function-pointer targets into
> the indirect predictor.
> 
> > +		instrumentation_end();
> > +		irqentry_exit(regs, state);
> > +	} else {
> > +		common_interrupt(regs, vector);
> > +	}
> > +}
> > +
> > +static noinstr void fred_exception(struct pt_regs *regs, unsigned
> > +long error_code) {
> > +	/* Optimize for #PF. That's the only exception which matters performance
> wise */
> > +	if (likely(regs->fred_ss.vector == X86_TRAP_PF)) {
> > +		exc_page_fault(regs, error_code);
> > +		return;
> > +	}
> > +
> > +	switch (regs->fred_ss.vector) {
> > +	case X86_TRAP_DE: return exc_divide_error(regs);
> > +	case X86_TRAP_DB: return fred_exc_debug(regs);
> > +	case X86_TRAP_BP: return exc_int3(regs);
> > +	case X86_TRAP_OF: return exc_overflow(regs);
> 
> Depending on what you want to do with BP/OF vs fred_intx(), this may need
> adjusting.
> 
> If you are cross-checking type and vector, then these should be rejected for not
> being of type HWEXC.
> 
> > +	case X86_TRAP_BR: return exc_bounds(regs);
> > +	case X86_TRAP_UD: return exc_invalid_op(regs);
> > +	case X86_TRAP_NM: return exc_device_not_available(regs);
> > +	case X86_TRAP_DF: return exc_double_fault(regs, error_code);
> > +	case X86_TRAP_TS: return exc_invalid_tss(regs, error_code);
> > +	case X86_TRAP_NP: return exc_segment_not_present(regs, error_code);
> > +	case X86_TRAP_SS: return exc_stack_segment(regs, error_code);
> > +	case X86_TRAP_GP: return exc_general_protection(regs, error_code);
> > +	case X86_TRAP_MF: return exc_coprocessor_error(regs);
> > +	case X86_TRAP_AC: return exc_alignment_check(regs, error_code);
> > +	case X86_TRAP_XF: return exc_simd_coprocessor_error(regs);
> > +
> > +#ifdef CONFIG_X86_MCE
> > +	case X86_TRAP_MC: return fred_exc_machine_check(regs); #endif #ifdef
> > +CONFIG_INTEL_TDX_GUEST
> > +	case X86_TRAP_VE: return exc_virtualization_exception(regs);
> > +#endif
> > +#ifdef CONFIG_X86_KERNEL_IBT
> 
> CONFIG_X86_CET
> 
> Userspace can use CET even if the kernel isn't compiled with IBT, so this
> exception needs handling.
> 
> > +	case X86_TRAP_CP: return exc_control_protection(regs, error_code);
> > +#endif
> > +	default: return fred_bad_type(regs, error_code);
> > +	}
> > +}
> > +
> > +__visible noinstr void fred_entry_from_user(struct pt_regs *regs) {
> > +	unsigned long error_code = regs->orig_ax;
> > +
> > +	/* Invalidate orig_ax so that syscall_get_nr() works correctly */
> > +	regs->orig_ax = -1;
> > +
> > +	switch (regs->fred_ss.type) {
> > +	case EVENT_TYPE_EXTINT:
> > +		return fred_extint(regs);
> > +	case EVENT_TYPE_NMI:
> > +		return fred_exc_nmi(regs);
> > +	case EVENT_TYPE_SWINT:
> > +		return fred_intx(regs);
> > +	case EVENT_TYPE_HWEXC:
> > +	case EVENT_TYPE_SWEXC:
> > +	case EVENT_TYPE_PRIV_SWEXC:
> > +		return fred_exception(regs, error_code);
> 
> PRIV_SWEXC should have it's own function and not fall into fred_exception().
> 
> It is strictly only the ICEBP (INT1) instruction at the moment, so should fall into
> bad_type() for any vector other than X86_TRAP_DB.
> 
> > +	case EVENT_TYPE_OTHER:
> > +		return fred_other(regs);
> > +	default:
> > +		return fred_bad_type(regs, error_code);
> > +	}
> > +}
> 
> ~Andrew


Thanks a lot for your quick review, will address soon.
    Xin
Li, Xin3 Dec. 6, 2023, 7:45 a.m. UTC | #3
> > diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c
> > new file mode 100644 index 000000000000..215883e90f94
> > --- /dev/null
> > +++ b/arch/x86/entry/entry_fred.c
> > @@ -0,0 +1,230 @@
> > ...
> > +static noinstr void fred_intx(struct pt_regs *regs) {
> > +	switch (regs->fred_ss.vector) {
> > +	/* INT0 */
> 
> INTO (for overflow), not INT-zero.  However...

My bad again...

> > +	case X86_TRAP_OF:
> > +		exc_overflow(regs);
> > +		return;
> > +
> > +	/* INT3 */
> > +	case X86_TRAP_BP:
> > +		exc_int3(regs);
> > +		return;
> 
> ... neither OF nor BP will ever enter fred_intx() because they're type SWEXC not
> SWINT.

Per FRED spec 5.0, section 7.3 Software Interrupts and Related Instructions:
INT n (opcode CD followed by an immediate byte): There are 256 such
software interrupt instructions, one for each value n of the immediate
byte (0–255).

And appendix B Event Stack Levels:
If the event is an execution of INT n (opcode CD n for 8-bit value n),
the event stack level is 0. The event type is 4 (software interrupt)
and the vector is n.

So int $0x4 and int $0x3 (use asm(".byte 0xCD, 0x03")) get here.

But into (0xCE) and int3 (0xCC) do use event type SWEXC. 

BTW, into is NOT allowed in 64-bit mode but "int $0x4" is allowed.

> 
> SWINT is strictly the INT $imm8 instruction.
> 
> > ...
> > +static noinstr void fred_extint(struct pt_regs *regs) {
> > +	unsigned int vector = regs->fred_ss.vector;
> > +
> > +	if (WARN_ON_ONCE(vector < FIRST_EXTERNAL_VECTOR))
> > +		return;
> > +
> > +	if (likely(vector >= FIRST_SYSTEM_VECTOR)) {
> > +		irqentry_state_t state = irqentry_enter(regs);
> > +
> > +		instrumentation_begin();
> > +		sysvec_table[vector - FIRST_SYSTEM_VECTOR](regs);
> 
> array_index_mask_nospec()
> 
> This is easy for an attacker to abuse, to install non-function-pointer targets into
> the indirect predictor.

HPA did use array_index_nospec() at the beginning, but I forgot it later.

> 
> > +		instrumentation_end();
> > +		irqentry_exit(regs, state);
> > +	} else {
> > +		common_interrupt(regs, vector);
> > +	}
> > +}
> > +
> > +static noinstr void fred_exception(struct pt_regs *regs, unsigned
> > +long error_code) {
> > +	/* Optimize for #PF. That's the only exception which matters performance
> wise */
> > +	if (likely(regs->fred_ss.vector == X86_TRAP_PF)) {
> > +		exc_page_fault(regs, error_code);
> > +		return;
> > +	}
> > +
> > +	switch (regs->fred_ss.vector) {
> > +	case X86_TRAP_DE: return exc_divide_error(regs);
> > +	case X86_TRAP_DB: return fred_exc_debug(regs);
> > +	case X86_TRAP_BP: return exc_int3(regs);
> > +	case X86_TRAP_OF: return exc_overflow(regs);
> 
> Depending on what you want to do with BP/OF vs fred_intx(), this may need
> adjusting.
> 
> If you are cross-checking type and vector, then these should be rejected for not
> being of type HWEXC.

You're right, the event type needs to be SWEXC for into and int3.

However, would that be overkill, assuming the hardware and VMM are sane?

> 
> > +	case X86_TRAP_BR: return exc_bounds(regs);
> > +	case X86_TRAP_UD: return exc_invalid_op(regs);
> > +	case X86_TRAP_NM: return exc_device_not_available(regs);
> > +	case X86_TRAP_DF: return exc_double_fault(regs, error_code);
> > +	case X86_TRAP_TS: return exc_invalid_tss(regs, error_code);
> > +	case X86_TRAP_NP: return exc_segment_not_present(regs, error_code);
> > +	case X86_TRAP_SS: return exc_stack_segment(regs, error_code);
> > +	case X86_TRAP_GP: return exc_general_protection(regs, error_code);
> > +	case X86_TRAP_MF: return exc_coprocessor_error(regs);
> > +	case X86_TRAP_AC: return exc_alignment_check(regs, error_code);
> > +	case X86_TRAP_XF: return exc_simd_coprocessor_error(regs);
> > +
> > +#ifdef CONFIG_X86_MCE
> > +	case X86_TRAP_MC: return fred_exc_machine_check(regs); #endif #ifdef
> > +CONFIG_INTEL_TDX_GUEST
> > +	case X86_TRAP_VE: return exc_virtualization_exception(regs);
> > +#endif
> > +#ifdef CONFIG_X86_KERNEL_IBT
> 
> CONFIG_X86_CET
> 
> Userspace can use CET even if the kernel isn't compiled with IBT, so this
> exception needs handling.

Absolutely correct!

> 
> > +	case X86_TRAP_CP: return exc_control_protection(regs, error_code);
> > +#endif
> > +	default: return fred_bad_type(regs, error_code);
> > +	}
> > +}
> > +
> > +__visible noinstr void fred_entry_from_user(struct pt_regs *regs) {
> > +	unsigned long error_code = regs->orig_ax;
> > +
> > +	/* Invalidate orig_ax so that syscall_get_nr() works correctly */
> > +	regs->orig_ax = -1;
> > +
> > +	switch (regs->fred_ss.type) {
> > +	case EVENT_TYPE_EXTINT:
> > +		return fred_extint(regs);
> > +	case EVENT_TYPE_NMI:
> > +		return fred_exc_nmi(regs);
> > +	case EVENT_TYPE_SWINT:
> > +		return fred_intx(regs);
> > +	case EVENT_TYPE_HWEXC:
> > +	case EVENT_TYPE_SWEXC:
> > +	case EVENT_TYPE_PRIV_SWEXC:
> > +		return fred_exception(regs, error_code);
> 
> PRIV_SWEXC should have it's own function and not fall into fred_exception().
> 
> It is strictly only the ICEBP (INT1) instruction at the moment, so should fall into
> bad_type() for any vector other than X86_TRAP_DB.

Good point!

It's like NMI, one event type with only one valid event vector now.

> 
> > +	case EVENT_TYPE_OTHER:
> > +		return fred_other(regs);
> > +	default:
> > +		return fred_bad_type(regs, error_code);
> > +	}
> > +}
> 
> ~Andrew

Thanks!
    Xin
Andrew Cooper Dec. 6, 2023, 2:11 p.m. UTC | #4
On 06/12/2023 7:45 am, Li, Xin3 wrote:
>>> +	case X86_TRAP_OF:
>>> +		exc_overflow(regs);
>>> +		return;
>>> +
>>> +	/* INT3 */
>>> +	case X86_TRAP_BP:
>>> +		exc_int3(regs);
>>> +		return;
>> ... neither OF nor BP will ever enter fred_intx() because they're type SWEXC not
>> SWINT.
> Per FRED spec 5.0, section 7.3 Software Interrupts and Related Instructions:
> INT n (opcode CD followed by an immediate byte): There are 256 such
> software interrupt instructions, one for each value n of the immediate
> byte (0–255).
>
> And appendix B Event Stack Levels:
> If the event is an execution of INT n (opcode CD n for 8-bit value n),
> the event stack level is 0. The event type is 4 (software interrupt)
> and the vector is n.
>
> So int $0x4 and int $0x3 (use asm(".byte 0xCD, 0x03")) get here.
>
> But into (0xCE) and int3 (0xCC) do use event type SWEXC. 
>
> BTW, into is NOT allowed in 64-bit mode but "int $0x4" is allowed.

There is certainly fun to be had with CD 03 and CD 04 byte patterns, but
if you meant those here, then the comments are wrong.

Vectors 3 and 4 are installed with DPL3 because that is necessary to
make CC and CE function in userspace.  It also suggests that the SWINT
vs SWEXC distinction was retrofitted to architecture after the 286,
because exceptions don't check DPL and ICEBP delivers #DB from userspace
even when Vector 1 has a DPL of 0.

While CC is for most cases indistinguishable from CD 03, CE behaves
entirely differently to CD 04.  CD 04 doesn't #UD in 64bit mode, and
will trigger exc_overflow() irrespective of the state of EFLAGS.OF.


The SDM goes out of its way to say not to use the CD 03 byte pattern
(and it does take effort to emit this byte pattern - e.g. GAS will
silently translate "int $3" to "int3"), and there's no plausible way
software is using CD 04 in place of CE.

So why do we care about continuing to make mistakes of the IDT era work
in a FRED world?

Is there anything (other than perhaps the selftests) which would even
notice?

>>> +		instrumentation_end();
>>> +		irqentry_exit(regs, state);
>>> +	} else {
>>> +		common_interrupt(regs, vector);
>>> +	}
>>> +}
>>> +
>>> +static noinstr void fred_exception(struct pt_regs *regs, unsigned
>>> +long error_code) {
>>> +	/* Optimize for #PF. That's the only exception which matters performance
>> wise */
>>> +	if (likely(regs->fred_ss.vector == X86_TRAP_PF)) {
>>> +		exc_page_fault(regs, error_code);
>>> +		return;
>>> +	}
>>> +
>>> +	switch (regs->fred_ss.vector) {
>>> +	case X86_TRAP_DE: return exc_divide_error(regs);
>>> +	case X86_TRAP_DB: return fred_exc_debug(regs);
>>> +	case X86_TRAP_BP: return exc_int3(regs);
>>> +	case X86_TRAP_OF: return exc_overflow(regs);
>> Depending on what you want to do with BP/OF vs fred_intx(), this may need
>> adjusting.
>>
>> If you are cross-checking type and vector, then these should be rejected for not
>> being of type HWEXC.
> You're right, the event type needs to be SWEXC for into and int3.
>
> However, would it be overkilling?  Assuming hardware and VMM are sane.

You either care about cross checking, or not.  Right now, this patch is
a mix of the two approaches.

In my opinion, cross-checking is the better approach, because it means
that violations of the assumptions get noticed more quickly, and
hopefully by whomever is working on the new feature which alters the
assumptions.

~Andrew
Li, Xin3 Dec. 6, 2023, 7:19 p.m. UTC | #5
> >>> +	case X86_TRAP_OF:
> >>> +		exc_overflow(regs);
> >>> +		return;
> >>> +
> >>> +	/* INT3 */
> >>> +	case X86_TRAP_BP:
> >>> +		exc_int3(regs);
> >>> +		return;
> >> ... neither OF nor BP will ever enter fred_intx() because they're
> >> type SWEXC not SWINT.
> > Per FRED spec 5.0, section 7.3 Software Interrupts and Related Instructions:
> > INT n (opcode CD followed by an immediate byte): There are 256 such
> > software interrupt instructions, one for each value n of the immediate
> > byte (0–255).
> >
> > And appendix B Event Stack Levels:
> > If the event is an execution of INT n (opcode CD n for 8-bit value n),
> > the event stack level is 0. The event type is 4 (software interrupt)
> > and the vector is n.
> >
> > So int $0x4 and int $0x3 (use asm(".byte 0xCD, 0x03")) get here.
> >
> > But into (0xCE) and int3 (0xCC) do use event type SWEXC.
> >
> > BTW, into is NOT allowed in 64-bit mode but "int $0x4" is allowed.
> 
> There is certainly fun to be had with CD 03 and CD 04 byte patterns, but if you
> meant to mean those here, then the comments are wrong.
> 
> Vectors 3 and 4 are installed with DPL3 because that is necessary to make CC and
> CE function in userspace.  It also suggests that the SWINT vs SWEXC distinction
> was retrofitted to architecture after the 286, because exceptions don't check DPL
> and ICEBP delivers #DB from userspace even when Vector 1 has a DPL of 0.
> 
> While CC is for most cases indistinguishable from CD 03, CE behaves entirely
> differently to CD 04.  CD 04 doesn't #UD in 64bit mode, and will trigger
> exc_overflow() irrespective of the state of EFLAGS.OF.
> 
> 
> The SDM goes out of it's way to say not to use the CD 03 byte pattern (and it
> does take effort to emit this byte pattern - e.g. GAS will silently translate "int $3"
> to "int3"), and there's no plausible way software is using CD 04 in place of CE.
> 
> So why do we care about containing to make mistakes of the IDT era work in a
> FRED world?

First, I agree with you because it makes things simple and neat.

However, neither the latest SDM nor FRED spec 5.0 disallows it, so it
becomes an OS implementation choice.

> 
> Is there anything (other than perhaps the selftests) which would even notice?

I'm just conservative :)

If a user app can do it with IDT, we should still allow it when FRED is
enabled.  But if all key stakeholders agree to the change and don't care
about whatever gets broken by it, I can change it.

> >>> +		instrumentation_end();
> >>> +		irqentry_exit(regs, state);
> >>> +	} else {
> >>> +		common_interrupt(regs, vector);
> >>> +	}
> >>> +}
> >>> +
> >>> +static noinstr void fred_exception(struct pt_regs *regs, unsigned
> >>> +long error_code) {
> >>> +	/* Optimize for #PF. That's the only exception which matters
> >>> +performance
> >> wise */
> >>> +	if (likely(regs->fred_ss.vector == X86_TRAP_PF)) {
> >>> +		exc_page_fault(regs, error_code);
> >>> +		return;
> >>> +	}
> >>> +
> >>> +	switch (regs->fred_ss.vector) {
> >>> +	case X86_TRAP_DE: return exc_divide_error(regs);
> >>> +	case X86_TRAP_DB: return fred_exc_debug(regs);
> >>> +	case X86_TRAP_BP: return exc_int3(regs);
> >>> +	case X86_TRAP_OF: return exc_overflow(regs);
> >> Depending on what you want to do with BP/OF vs fred_intx(), this may
> >> need adjusting.
> >>
> >> If you are cross-checking type and vector, then these should be
> >> rejected for not being of type HWEXC.
> > You're right, the event type needs to be SWEXC for into and int3.
> >
> > However, would it be overkilling?  Assuming hardware and VMM are sane.
> 
> You either care about cross checking, or not.  Right now, this patch is a mix of the
> two approaches.
> 
> In my opinion, cross-checking is the better approach, because it means that
> violations of the assumptions get noticed more quickly, and hopefully by
> whomever is working on the new feature which alters the assumptions.

Yeah, I can make the change.

Thanks!
    Xin
H. Peter Anvin Dec. 6, 2023, 7:26 p.m. UTC | #6
On December 6, 2023 11:19:26 AM PST, "Li, Xin3" <xin3.li@intel.com> wrote:
>> >>> +	case X86_TRAP_OF:
>> >>> +		exc_overflow(regs);
>> >>> +		return;
>> >>> +
>> >>> +	/* INT3 */
>> >>> +	case X86_TRAP_BP:
>> >>> +		exc_int3(regs);
>> >>> +		return;
>> >> ... neither OF nor BP will ever enter fred_intx() because they're
>> >> type SWEXC not SWINT.
>> > Per FRED spec 5.0, section 7.3 Software Interrupts and Related Instructions:
>> > INT n (opcode CD followed by an immediate byte): There are 256 such
>> > software interrupt instructions, one for each value n of the immediate
>> > byte (0–255).
>> >
>> > And appendix B Event Stack Levels:
>> > If the event is an execution of INT n (opcode CD n for 8-bit value n),
>> > the event stack level is 0. The event type is 4 (software interrupt)
>> > and the vector is n.
>> >
>> > So int $0x4 and int $0x3 (use asm(".byte 0xCD, 0x03")) get here.
>> >
>> > But into (0xCE) and int3 (0xCC) do use event type SWEXC.
>> >
>> > BTW, into is NOT allowed in 64-bit mode but "int $0x4" is allowed.
>> 
>> There is certainly fun to be had with CD 03 and CD 04 byte patterns, but if you
>> meant to mean those here, then the comments are wrong.
>> 
>> Vectors 3 and 4 are installed with DPL3 because that is necessary to make CC and
>> CE function in userspace.  It also suggests that the SWINT vs SWEXC distinction
>> was retrofitted to architecture after the 286, because exceptions don't check DPL
>> and ICEBP delivers #DB from userspace even when Vector 1 has a DPL of 0.
>> 
>> While CC is for most cases indistinguishable from CD 03, CE behaves entirely
>> differently to CD 04.  CD 04 doesn't #UD in 64bit mode, and will trigger
>> exc_overflow() irrespective of the state of EFLAGS.OF.
>> 
>> 
>> The SDM goes out of it's way to say not to use the CD 03 byte pattern (and it
>> does take effort to emit this byte pattern - e.g. GAS will silently translate "int $3"
>> to "int3"), and there's no plausible way software is using CD 04 in place of CE.
>> 
>> So why do we care about containing to make mistakes of the IDT era work in a
>> FRED world?
>
>First, I agree with you because it makes things simple and neat.
>
>However, the latest SDM and FRED spec 5.0 both doesn't disallow it, so it
>becomes an OS implementation choice.
>
>> 
>> Is there anything (other than perhaps the selftests) which would even notice?
>
>I'm just conservative :)
>
>If a user app can do it with IDT, we should still allow it when FRED is
>enabled.  But if all key stakeholders don't care whatever gets broken
>due to the change and agree to change it.
>
>> >>> +		instrumentation_end();
>> >>> +		irqentry_exit(regs, state);
>> >>> +	} else {
>> >>> +		common_interrupt(regs, vector);
>> >>> +	}
>> >>> +}
>> >>> +
>> >>> +static noinstr void fred_exception(struct pt_regs *regs, unsigned
>> >>> +long error_code) {
>> >>> +	/* Optimize for #PF. That's the only exception which matters
>> >>> +performance
>> >> wise */
>> >>> +	if (likely(regs->fred_ss.vector == X86_TRAP_PF)) {
>> >>> +		exc_page_fault(regs, error_code);
>> >>> +		return;
>> >>> +	}
>> >>> +
>> >>> +	switch (regs->fred_ss.vector) {
>> >>> +	case X86_TRAP_DE: return exc_divide_error(regs);
>> >>> +	case X86_TRAP_DB: return fred_exc_debug(regs);
>> >>> +	case X86_TRAP_BP: return exc_int3(regs);
>> >>> +	case X86_TRAP_OF: return exc_overflow(regs);
>> >> Depending on what you want to do with BP/OF vs fred_intx(), this may
>> >> need adjusting.
>> >>
>> >> If you are cross-checking type and vector, then these should be
>> >> rejected for not being of type HWEXC.
>> > You're right, the event type needs to be SWEXC for into and int3.
>> >
>> > However, would it be overkilling?  Assuming hardware and VMM are sane.
>> 
>> You either care about cross checking, or not.  Right now, this patch is a mix of the
>> two approaches.
>> 
>> In my opinion, cross-checking is the better approach, because it means that
>> violations of the assumptions get noticed more quickly, and hopefully by
>> whomever is working on the new feature which alters the assumptions.
>
>Yeah, I can make the change.
>
>Thanks!
>    Xin
>

The intent is to not break userspace even if userspace does something fundamentally stupid.
Brian Gerst Dec. 6, 2023, 7:58 p.m. UTC | #7
On Wed, Dec 6, 2023 at 2:19 PM Li, Xin3 <xin3.li@intel.com> wrote:
>
> > >>> + case X86_TRAP_OF:
> > >>> +         exc_overflow(regs);
> > >>> +         return;
> > >>> +
> > >>> + /* INT3 */
> > >>> + case X86_TRAP_BP:
> > >>> +         exc_int3(regs);
> > >>> +         return;
> > >> ... neither OF nor BP will ever enter fred_intx() because they're
> > >> type SWEXC not SWINT.
> > > Per FRED spec 5.0, section 7.3 Software Interrupts and Related Instructions:
> > > INT n (opcode CD followed by an immediate byte): There are 256 such
> > > software interrupt instructions, one for each value n of the immediate
> > > byte (0–255).
> > >
> > > And appendix B Event Stack Levels:
> > > If the event is an execution of INT n (opcode CD n for 8-bit value n),
> > > the event stack level is 0. The event type is 4 (software interrupt)
> > > and the vector is n.
> > >
> > > So int $0x4 and int $0x3 (use asm(".byte 0xCD, 0x03")) get here.
> > >
> > > But into (0xCE) and int3 (0xCC) do use event type SWEXC.
> > >
> > > BTW, into is NOT allowed in 64-bit mode but "int $0x4" is allowed.
> >
> > There is certainly fun to be had with CD 03 and CD 04 byte patterns, but if you
> > meant to mean those here, then the comments are wrong.
> >
> > Vectors 3 and 4 are installed with DPL3 because that is necessary to make CC and
> > CE function in userspace.  It also suggests that the SWINT vs SWEXC distinction
> > was retrofitted to architecture after the 286, because exceptions don't check DPL
> > and ICEBP delivers #DB from userspace even when Vector 1 has a DPL of 0.
> >
> > While CC is for most cases indistinguishable from CD 03, CE behaves entirely
> > differently to CD 04.  CD 04 doesn't #UD in 64bit mode, and will trigger
> > exc_overflow() irrespective of the state of EFLAGS.OF.
> >
> >
> > The SDM goes out of it's way to say not to use the CD 03 byte pattern (and it
> > does take effort to emit this byte pattern - e.g. GAS will silently translate "int $3"
> > to "int3"), and there's no plausible way software is using CD 04 in place of CE.
> >
> > So why do we care about containing to make mistakes of the IDT era work in a
> > FRED world?
>
> First, I agree with you because it makes things simple and neat.
>
> However, the latest SDM and FRED spec 5.0 both doesn't disallow it, so it
> becomes an OS implementation choice.
>
> >
> > Is there anything (other than perhaps the selftests) which would even notice?
>
> I'm just conservative :)
>
> If a user app can do it with IDT, we should still allow it when FRED is
> enabled.  But if all key stakeholders don't care whatever gets broken
> due to the change and agree to change it.

One case to consider is Windows software running under Wine.
Anti-tampering code has been known to do some non-standard things,
like using ICEBP or using SYSCALL directly instead of through system
DLLs.  Keeping the status quo should be preferred, especially if
Microsoft does the same.


Brian Gerst
Li, Xin3 Dec. 7, 2023, 9:43 a.m. UTC | #8
> > In my opinion, cross-checking is the better approach, because it means that
> > violations of the assumptions get noticed more quickly, and hopefully by
> > whomever is working on the new feature which alters the assumptions.
> 
> Yeah, I can make the change.
 

Hi Andrew,

Following is the updated patch, can you please have another review?

Thanks!
    Xin


diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c
new file mode 100644
index 000000000000..fd36fb8d2a19
--- /dev/null
+++ b/arch/x86/entry/entry_fred.c
@@ -0,0 +1,252 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * The FRED specific kernel/user entry functions which are invoked from
+ * assembly code and dispatch to the associated handlers.
+ */
+#include <linux/kernel.h>
+#include <linux/kdebug.h>
+#include <linux/nospec.h>
+
+#include <asm/desc.h>
+#include <asm/fred.h>
+#include <asm/idtentry.h>
+#include <asm/syscall.h>
+#include <asm/trapnr.h>
+#include <asm/traps.h>
+
+/* FRED EVENT_TYPE_OTHER vector numbers */
+#define FRED_SYSCALL			1
+#define FRED_SYSENTER			2
+
+/*
+ * Report a FRED event that none of the dispatchers could handle.
+ *
+ * @regs:	register frame of the interrupted context
+ * @error_code:	error code of the event, as captured by the entry function
+ *		before it reset regs->orig_ax to -1
+ *
+ * Events taken from a stack level above 0 end in panic().  For events from
+ * stack level 0, oops_end() is called with SIGKILL, or with 0 when __die()
+ * returns non-zero.
+ */
+static noinstr void fred_bad_type(struct pt_regs *regs, unsigned long error_code)
+{
+	irqentry_state_t irq_state = irqentry_nmi_enter(regs);
+
+	instrumentation_begin();
+
+	/*
+	 * Report @error_code rather than regs->orig_ax below: the entry
+	 * functions have already overwritten orig_ax with -1.
+	 */
+
+	/* Panic on events from a high stack level */
+	if (regs->fred_cs.sl > 0) {
+		pr_emerg("PANIC: invalid or fatal FRED event; event type %u "
+			 "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n",
+			 regs->fred_ss.type, regs->fred_ss.vector, error_code,
+			 fred_event_data(regs), regs->cs, regs->ip);
+		die("invalid or fatal FRED event", regs, error_code);
+		panic("invalid or fatal FRED event");
+	} else {
+		unsigned long flags = oops_begin();
+		int sig = SIGKILL;
+
+		pr_alert("BUG: invalid or fatal FRED event; event type %u "
+			 "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n",
+			 regs->fred_ss.type, regs->fred_ss.vector, error_code,
+			 fred_event_data(regs), regs->cs, regs->ip);
+
+		/* A non-zero return from __die() suppresses the signal */
+		if (__die("Invalid or fatal FRED event", regs, error_code))
+			sig = 0;
+
+		oops_end(flags, regs, sig);
+	}
+
+	instrumentation_end();
+	irqentry_nmi_exit(regs, irq_state);
+}
+
+/*
+ * Second-level dispatch for software interrupts (INT n), keyed on the vector.
+ * Anything that is not explicitly supported ends up in the #GP handler with
+ * an error code of 0.
+ */
+static noinstr void fred_intx(struct pt_regs *regs)
+{
+	unsigned int vec = regs->fred_ss.vector;
+
+	/* Opcode 0xcd, 0x3, NOT INT3 (opcode 0xcc) */
+	if (vec == X86_TRAP_BP) {
+		exc_int3(regs);
+		return;
+	}
+
+	/* Opcode 0xcd, 0x4, NOT INTO (opcode 0xce) */
+	if (vec == X86_TRAP_OF) {
+		exc_overflow(regs);
+		return;
+	}
+
+	/* INT80, only honoured when IA32 support is enabled */
+	if (vec == IA32_SYSCALL_VECTOR && ia32_enabled()) {
+		/* Save the syscall number */
+		regs->orig_ax = regs->ax;
+		regs->ax = -ENOSYS;
+		do_int80_syscall_32(regs);
+		return;
+	}
+
+	exc_general_protection(regs, 0);
+}
+
+/*
+ * Second-level dispatch for EVENT_TYPE_OTHER: a SYSCALL in long mode, a
+ * SYSENTER outside long mode (when IA32 support is enabled), and the
+ * invalid opcode handler for everything else.
+ */
+static __always_inline void fred_other(struct pt_regs *regs)
+{
+	/* The compiler can fold these conditions into a single test */
+	if (likely(regs->fred_ss.vector == FRED_SYSCALL && regs->fred_ss.lm)) {
+		/* Stash the syscall number, preset the return value */
+		regs->orig_ax = regs->ax;
+		regs->ax = -ENOSYS;
+		do_syscall_64(regs, regs->orig_ax);
+		return;
+	}
+
+	if (ia32_enabled() &&
+	    likely(regs->fred_ss.vector == FRED_SYSENTER &&
+		   !regs->fred_ss.lm)) {
+		regs->orig_ax = regs->ax;
+		regs->ax = -ENOSYS;
+		do_fast_syscall_32(regs);
+		return;
+	}
+
+	exc_invalid_op(regs);
+}
+
+/*
+ * Build a designated initializer that stores fred_sysvec_<_function> in the
+ * table slot for _vector.  Slots are indexed relative to FIRST_SYSTEM_VECTOR.
+ */
+#define SYSVEC(_vector, _function) [_vector - FIRST_SYSTEM_VECTOR] = fred_sysvec_##_function
+
+/*
+ * Handlers for the system vectors, called by index from fred_extint().
+ *
+ * NOTE(review): slots without a SYSVEC() entry are zero-initialized here and
+ * fred_extint() calls through the table without a NULL check; presumably the
+ * remaining slots are populated elsewhere during init - confirm.
+ */
+static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = {
+	SYSVEC(ERROR_APIC_VECTOR,		error_interrupt),
+	SYSVEC(SPURIOUS_APIC_VECTOR,		spurious_apic_interrupt),
+	SYSVEC(LOCAL_TIMER_VECTOR,		apic_timer_interrupt),
+	SYSVEC(X86_PLATFORM_IPI_VECTOR,		x86_platform_ipi),
+
+	SYSVEC(RESCHEDULE_VECTOR,		reschedule_ipi),
+	SYSVEC(CALL_FUNCTION_SINGLE_VECTOR,	call_function_single),
+	SYSVEC(CALL_FUNCTION_VECTOR,		call_function),
+	SYSVEC(REBOOT_VECTOR,			reboot),
+
+	SYSVEC(THRESHOLD_APIC_VECTOR,		threshold),
+	SYSVEC(DEFERRED_ERROR_VECTOR,		deferred_error),
+	SYSVEC(THERMAL_APIC_VECTOR,		thermal),
+
+	SYSVEC(IRQ_WORK_VECTOR,			irq_work),
+
+	SYSVEC(POSTED_INTR_VECTOR,		kvm_posted_intr_ipi),
+	SYSVEC(POSTED_INTR_WAKEUP_VECTOR,	kvm_posted_intr_wakeup_ipi),
+	SYSVEC(POSTED_INTR_NESTED_VECTOR,	kvm_posted_intr_nested_ipi),
+};
+
+/*
+ * Dispatch an external interrupt.  Vectors below FIRST_EXTERNAL_VECTOR are
+ * rejected with a one-time warning, vectors below FIRST_SYSTEM_VECTOR go to
+ * common_interrupt(), and system vectors are called through sysvec_table[]
+ * between irqentry_enter() and irqentry_exit().
+ */
+static noinstr void fred_extint(struct pt_regs *regs)
+{
+	unsigned int vector = regs->fred_ss.vector;
+	/* Table slot, clamped so it cannot run past the table speculatively */
+	unsigned int index = array_index_nospec(vector - FIRST_SYSTEM_VECTOR,
+						NR_SYSTEM_VECTORS);
+	irqentry_state_t state;
+
+	if (WARN_ON_ONCE(vector < FIRST_EXTERNAL_VECTOR))
+		return;
+
+	/* Device interrupts take the common path */
+	if (unlikely(vector < FIRST_SYSTEM_VECTOR)) {
+		common_interrupt(regs, vector);
+		return;
+	}
+
+	state = irqentry_enter(regs);
+
+	instrumentation_begin();
+	sysvec_table[index](regs);
+	instrumentation_end();
+
+	irqentry_exit(regs, state);
+}
+
+/*
+ * Second-level dispatch for EVENT_TYPE_HWEXC events, keyed on the vector.
+ *
+ * @regs:	register frame of the interrupted context
+ * @error_code:	error code of the event; forwarded only to the handlers that
+ *		take one
+ *
+ * #DB and #MC go to FRED specific handlers.  Vectors without a case here
+ * (including those compiled out by the #ifdefs) are reported through
+ * fred_bad_type().
+ */
+static noinstr void fred_hwexc(struct pt_regs *regs, unsigned long error_code)
+{
+	/* Optimize for #PF. That's the only exception which matters performance wise */
+	if (likely(regs->fred_ss.vector == X86_TRAP_PF))
+		return exc_page_fault(regs, error_code);
+
+	switch (regs->fred_ss.vector) {
+	case X86_TRAP_DE: return exc_divide_error(regs);
+	case X86_TRAP_DB: return fred_exc_debug(regs);
+	case X86_TRAP_BR: return exc_bounds(regs);
+	case X86_TRAP_UD: return exc_invalid_op(regs);
+	case X86_TRAP_NM: return exc_device_not_available(regs);
+	case X86_TRAP_DF: return exc_double_fault(regs, error_code);
+	case X86_TRAP_TS: return exc_invalid_tss(regs, error_code);
+	case X86_TRAP_NP: return exc_segment_not_present(regs, error_code);
+	case X86_TRAP_SS: return exc_stack_segment(regs, error_code);
+	case X86_TRAP_GP: return exc_general_protection(regs, error_code);
+	case X86_TRAP_MF: return exc_coprocessor_error(regs);
+	case X86_TRAP_AC: return exc_alignment_check(regs, error_code);
+	case X86_TRAP_XF: return exc_simd_coprocessor_error(regs);
+
+#ifdef CONFIG_X86_MCE
+	case X86_TRAP_MC: return fred_exc_machine_check(regs);
+#endif
+#ifdef CONFIG_INTEL_TDX_GUEST
+	case X86_TRAP_VE: return exc_virtualization_exception(regs);
+#endif
+#ifdef CONFIG_X86_CET
+	case X86_TRAP_CP: return exc_control_protection(regs, error_code);
+#endif
+	/* No handler for this vector */
+	default: return fred_bad_type(regs, error_code);
+	}
+
+}
+
+/*
+ * Second-level dispatch for EVENT_TYPE_SWEXC events.  Only the #BP and #OF
+ * vectors are accepted; any other vector is reported through
+ * fred_bad_type() together with @error_code.
+ */
+static noinstr void fred_swexc(struct pt_regs *regs, unsigned long error_code)
+{
+	unsigned int vec = regs->fred_ss.vector;
+
+	if (vec == X86_TRAP_BP)
+		exc_int3(regs);
+	else if (vec == X86_TRAP_OF)
+		exc_overflow(regs);
+	else
+		fred_bad_type(regs, error_code);
+}
+
+/*
+ * First-level dispatch, on the event type, for the entry path that handles
+ * events taken in user mode.
+ *
+ * @regs:	register frame saved by the entry code; regs->orig_ax holds the
+ *		event's error code on entry
+ *
+ * NMI and privileged software exception events are cross-checked against
+ * the only vector accepted for them; a mismatch, like an unknown event
+ * type, is reported through fred_bad_type().
+ */
+__visible noinstr void fred_entry_from_user(struct pt_regs *regs)
+{
+	/* Grab the error code before orig_ax is invalidated below */
+	unsigned long error_code = regs->orig_ax;
+
+	/* Invalidate orig_ax so that syscall_get_nr() works correctly */
+	regs->orig_ax = -1;
+
+	switch (regs->fred_ss.type) {
+	case EVENT_TYPE_EXTINT:
+		return fred_extint(regs);
+	case EVENT_TYPE_NMI:
+		/* Any vector other than X86_TRAP_NMI is a bad event */
+		if (likely(regs->fred_ss.vector == X86_TRAP_NMI))
+			return fred_exc_nmi(regs);
+		break;
+	case EVENT_TYPE_SWINT:
+		return fred_intx(regs);
+	case EVENT_TYPE_HWEXC:
+		return fred_hwexc(regs, error_code);
+	case EVENT_TYPE_SWEXC:
+		return fred_swexc(regs, error_code);
+	case EVENT_TYPE_PRIV_SWEXC:
+		/* Any vector other than X86_TRAP_DB is a bad event */
+		if (likely(regs->fred_ss.vector == X86_TRAP_DB))
+			return fred_exc_debug(regs);
+		break;
+	case EVENT_TYPE_OTHER:
+		return fred_other(regs);
+	default: break;
+	}
+
+	/* Reached for unknown types and for type/vector mismatches */
+	return fred_bad_type(regs, error_code);
+}
+
+/*
+ * First-level dispatch, on the event type, for the entry path that handles
+ * events taken in kernel mode.
+ *
+ * @regs:	register frame saved by the entry code; regs->orig_ax holds the
+ *		event's error code on entry
+ *
+ * Unlike fred_entry_from_user() there are no cases for EVENT_TYPE_SWINT and
+ * EVENT_TYPE_OTHER, so those types end up in fred_bad_type() like any other
+ * unknown type or type/vector mismatch.
+ */
+__visible noinstr void fred_entry_from_kernel(struct pt_regs *regs)
+{
+	/* Grab the error code before orig_ax is invalidated below */
+	unsigned long error_code = regs->orig_ax;
+
+	/* Invalidate orig_ax so that syscall_get_nr() works correctly */
+	regs->orig_ax = -1;
+
+	switch (regs->fred_ss.type) {
+	case EVENT_TYPE_EXTINT:
+		return fred_extint(regs);
+	case EVENT_TYPE_NMI:
+		/* Any vector other than X86_TRAP_NMI is a bad event */
+		if (likely(regs->fred_ss.vector == X86_TRAP_NMI))
+			return fred_exc_nmi(regs);
+		break;
+	case EVENT_TYPE_HWEXC:
+		return fred_hwexc(regs, error_code);
+	case EVENT_TYPE_SWEXC:
+		return fred_swexc(regs, error_code);
+	case EVENT_TYPE_PRIV_SWEXC:
+		/* Any vector other than X86_TRAP_DB is a bad event */
+		if (likely(regs->fred_ss.vector == X86_TRAP_DB))
+			return fred_exc_debug(regs);
+		break;
+	default: break;
+	}
+
+	/* Reached for unknown types and for type/vector mismatches */
+	return fred_bad_type(regs, error_code);
+}
diff mbox series

Patch

diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index ca2fe186994b..c93e7f5c2a06 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -18,6 +18,9 @@  obj-y				+= vdso/
 obj-y				+= vsyscall/
 
 obj-$(CONFIG_PREEMPTION)	+= thunk_$(BITS).o
+CFLAGS_entry_fred.o		+= -fno-stack-protector
+CFLAGS_REMOVE_entry_fred.o	+= -pg $(CC_FLAGS_FTRACE)
+obj-$(CONFIG_X86_FRED)		+= entry_64_fred.o entry_fred.o
+
 obj-$(CONFIG_IA32_EMULATION)	+= entry_64_compat.o syscall_32.o
 obj-$(CONFIG_X86_X32_ABI)	+= syscall_x32.o
-
diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S
new file mode 100644
index 000000000000..37a1dd5e8ace
--- /dev/null
+++ b/arch/x86/entry/entry_64_fred.S
@@ -0,0 +1,52 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * The actual FRED entry points.
+ */
+
+#include <asm/fred.h>
+
+#include "calling.h"
+
+	.code64
+	.section .noinstr.text, "ax"
+
+/*
+ * Common prologue of both FRED entry points: save the registers with
+ * PUSH_AND_CLEAR_REGS (not defined in this file) and leave the resulting
+ * stack pointer in %rdi as the pt_regs argument of the C entry function.
+ */
+.macro FRED_ENTER
+	UNWIND_HINT_END_OF_STACK
+	ENDBR
+	PUSH_AND_CLEAR_REGS
+	movq	%rsp, %rdi	/* %rdi -> pt_regs */
+.endm
+
+/*
+ * Common epilogue of both FRED entry points: restore the registers with
+ * POP_REGS (not defined in this file).  The event return instruction is
+ * issued by the caller of this macro.
+ */
+.macro FRED_EXIT
+	UNWIND_HINT_REGS
+	POP_REGS
+.endm
+
+/*
+ * The new RIP value that FRED event delivery establishes is
+ * IA32_FRED_CONFIG & ~FFFH for events that occur in ring 3.
+ * Thus the FRED ring 3 entry point must be 4K page aligned.
+ */
+	.align 4096
+
+/*
+ * Entry point for events that occur in ring 3: save the registers, run the
+ * C dispatcher with %rdi pointing to pt_regs, restore the registers and
+ * return with ERETU.
+ */
+SYM_CODE_START_NOALIGN(asm_fred_entrypoint_user)
+	FRED_ENTER
+	call	fred_entry_from_user
+	FRED_EXIT
+	ERETU
+SYM_CODE_END(asm_fred_entrypoint_user)
+
+.fill asm_fred_entrypoint_kernel - ., 1, 0xcc
+
+/*
+ * The new RIP value that FRED event delivery establishes is
+ * (IA32_FRED_CONFIG & ~FFFH) + 256 for events that occur in
+ * ring 0, i.e., asm_fred_entrypoint_user + 256.
+ */
+	.org asm_fred_entrypoint_user + 256
+/*
+ * Entry point for events that occur in ring 0: save the registers, run the
+ * C dispatcher with %rdi pointing to pt_regs, restore the registers and
+ * return with ERETS.
+ */
+SYM_CODE_START_NOALIGN(asm_fred_entrypoint_kernel)
+	FRED_ENTER
+	call	fred_entry_from_kernel
+	FRED_EXIT
+	ERETS
+SYM_CODE_END(asm_fred_entrypoint_kernel)
diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c
new file mode 100644
index 000000000000..215883e90f94
--- /dev/null
+++ b/arch/x86/entry/entry_fred.c
@@ -0,0 +1,230 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * The FRED specific kernel/user entry functions which are invoked from
+ * assembly code and dispatch to the associated handlers.
+ */
+#include <linux/kernel.h>
+#include <linux/kdebug.h>
+#include <linux/nospec.h>
+
+#include <asm/desc.h>
+#include <asm/fred.h>
+#include <asm/idtentry.h>
+#include <asm/syscall.h>
+#include <asm/trapnr.h>
+#include <asm/traps.h>
+
+/* FRED EVENT_TYPE_OTHER vector numbers */
+#define FRED_SYSCALL			1
+#define FRED_SYSENTER			2
+
+/*
+ * Report a FRED event that none of the dispatchers could handle.
+ *
+ * @regs:	register frame of the interrupted context
+ * @error_code:	error code of the event, as captured by the entry function
+ *		before it reset regs->orig_ax to -1
+ *
+ * Events taken from a stack level above 0 end in panic().  For events from
+ * stack level 0, oops_end() is called with SIGKILL, or with 0 when __die()
+ * returns non-zero.
+ */
+static noinstr void fred_bad_type(struct pt_regs *regs, unsigned long error_code)
+{
+	irqentry_state_t irq_state = irqentry_nmi_enter(regs);
+
+	instrumentation_begin();
+
+	/*
+	 * Report @error_code rather than regs->orig_ax below: the entry
+	 * functions have already overwritten orig_ax with -1.
+	 */
+
+	/* Panic on events from a high stack level */
+	if (regs->fred_cs.sl > 0) {
+		pr_emerg("PANIC: invalid or fatal FRED event; event type %u "
+			 "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n",
+			 regs->fred_ss.type, regs->fred_ss.vector, error_code,
+			 fred_event_data(regs), regs->cs, regs->ip);
+		die("invalid or fatal FRED event", regs, error_code);
+		panic("invalid or fatal FRED event");
+	} else {
+		unsigned long flags = oops_begin();
+		int sig = SIGKILL;
+
+		pr_alert("BUG: invalid or fatal FRED event; event type %u "
+			 "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n",
+			 regs->fred_ss.type, regs->fred_ss.vector, error_code,
+			 fred_event_data(regs), regs->cs, regs->ip);
+
+		/* A non-zero return from __die() suppresses the signal */
+		if (__die("Invalid or fatal FRED event", regs, error_code))
+			sig = 0;
+
+		oops_end(flags, regs, sig);
+	}
+
+	instrumentation_end();
+	irqentry_nmi_exit(regs, irq_state);
+}
+
+/*
+ * Second-level dispatch for EVENT_TYPE_SWINT events, keyed on the vector.
+ * Vectors without a dedicated case, and INT80 when IA32 support is not
+ * enabled, go to the #GP handler with an error code of 0.
+ */
+static noinstr void fred_intx(struct pt_regs *regs)
+{
+	switch (regs->fred_ss.vector) {
+	/* INT $4 (opcode 0xcd, 0x4), not INTO (opcode 0xce) */
+	case X86_TRAP_OF:
+		exc_overflow(regs);
+		return;
+
+	/* INT $3 (opcode 0xcd, 0x3), not INT3 (opcode 0xcc) */
+	case X86_TRAP_BP:
+		exc_int3(regs);
+		return;
+
+	/* INT80 */
+	case IA32_SYSCALL_VECTOR:
+		if (ia32_enabled()) {
+			/* Save the syscall number */
+			regs->orig_ax = regs->ax;
+			regs->ax = -ENOSYS;
+			do_int80_syscall_32(regs);
+			return;
+		}
+		fallthrough;
+
+	default:
+		exc_general_protection(regs, 0);
+		return;
+	}
+}
+
+/*
+ * Second-level dispatch for EVENT_TYPE_OTHER: a SYSCALL in long mode, a
+ * SYSENTER outside long mode (when IA32 support is enabled), and the
+ * invalid opcode handler for everything else.
+ */
+static __always_inline void fred_other(struct pt_regs *regs)
+{
+	/* The compiler can fold these conditions into a single test */
+	if (likely(regs->fred_ss.vector == FRED_SYSCALL && regs->fred_ss.lm)) {
+		/* Stash the syscall number, preset the return value */
+		regs->orig_ax = regs->ax;
+		regs->ax = -ENOSYS;
+		do_syscall_64(regs, regs->orig_ax);
+		return;
+	}
+
+	if (ia32_enabled() &&
+	    likely(regs->fred_ss.vector == FRED_SYSENTER &&
+		   !regs->fred_ss.lm)) {
+		regs->orig_ax = regs->ax;
+		regs->ax = -ENOSYS;
+		do_fast_syscall_32(regs);
+		return;
+	}
+
+	exc_invalid_op(regs);
+}
+
+/*
+ * Build a designated initializer that stores fred_sysvec_<_function> in the
+ * table slot for _vector.  Slots are indexed relative to FIRST_SYSTEM_VECTOR.
+ */
+#define SYSVEC(_vector, _function) [_vector - FIRST_SYSTEM_VECTOR] = fred_sysvec_##_function
+
+/*
+ * Handlers for the system vectors, called by index from fred_extint().
+ *
+ * NOTE(review): slots without a SYSVEC() entry are zero-initialized here and
+ * fred_extint() calls through the table without a NULL check; presumably the
+ * remaining slots are populated elsewhere during init - confirm.
+ */
+static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = {
+	SYSVEC(ERROR_APIC_VECTOR,		error_interrupt),
+	SYSVEC(SPURIOUS_APIC_VECTOR,		spurious_apic_interrupt),
+	SYSVEC(LOCAL_TIMER_VECTOR,		apic_timer_interrupt),
+	SYSVEC(X86_PLATFORM_IPI_VECTOR,		x86_platform_ipi),
+
+	SYSVEC(RESCHEDULE_VECTOR,		reschedule_ipi),
+	SYSVEC(CALL_FUNCTION_SINGLE_VECTOR,	call_function_single),
+	SYSVEC(CALL_FUNCTION_VECTOR,		call_function),
+	SYSVEC(REBOOT_VECTOR,			reboot),
+
+	SYSVEC(THRESHOLD_APIC_VECTOR,		threshold),
+	SYSVEC(DEFERRED_ERROR_VECTOR,		deferred_error),
+	SYSVEC(THERMAL_APIC_VECTOR,		thermal),
+
+	SYSVEC(IRQ_WORK_VECTOR,			irq_work),
+
+	SYSVEC(POSTED_INTR_VECTOR,		kvm_posted_intr_ipi),
+	SYSVEC(POSTED_INTR_WAKEUP_VECTOR,	kvm_posted_intr_wakeup_ipi),
+	SYSVEC(POSTED_INTR_NESTED_VECTOR,	kvm_posted_intr_nested_ipi),
+};
+
+/*
+ * Dispatch an external interrupt.  Vectors below FIRST_EXTERNAL_VECTOR are
+ * rejected with a one-time warning, vectors below FIRST_SYSTEM_VECTOR go to
+ * common_interrupt(), and system vectors are called through sysvec_table[]
+ * between irqentry_enter() and irqentry_exit().
+ */
+static noinstr void fred_extint(struct pt_regs *regs)
+{
+	unsigned int vector = regs->fred_ss.vector;
+	/*
+	 * Clamp the table index so that a mispredicted range check below
+	 * cannot index past the end of sysvec_table[] speculatively.
+	 */
+	unsigned int index = array_index_nospec(vector - FIRST_SYSTEM_VECTOR,
+						NR_SYSTEM_VECTORS);
+
+	if (WARN_ON_ONCE(vector < FIRST_EXTERNAL_VECTOR))
+		return;
+
+	if (likely(vector >= FIRST_SYSTEM_VECTOR)) {
+		irqentry_state_t state = irqentry_enter(regs);
+
+		instrumentation_begin();
+		sysvec_table[index](regs);
+		instrumentation_end();
+		irqentry_exit(regs, state);
+	} else {
+		common_interrupt(regs, vector);
+	}
+}
+
+/*
+ * Second-level dispatch for exception events, keyed on the vector.
+ *
+ * @regs:	register frame of the interrupted context
+ * @error_code:	error code of the event; forwarded only to the handlers that
+ *		take one
+ *
+ * #DB and #MC go to FRED specific handlers.  Vectors without a case here
+ * (including those compiled out by the #ifdefs) are reported through
+ * fred_bad_type().
+ */
+static noinstr void fred_exception(struct pt_regs *regs, unsigned long error_code)
+{
+	/* Optimize for #PF. That's the only exception which matters performance wise */
+	if (likely(regs->fred_ss.vector == X86_TRAP_PF)) {
+		exc_page_fault(regs, error_code);
+		return;
+	}
+
+	switch (regs->fred_ss.vector) {
+	case X86_TRAP_DE: return exc_divide_error(regs);
+	case X86_TRAP_DB: return fred_exc_debug(regs);
+	case X86_TRAP_BP: return exc_int3(regs);
+	case X86_TRAP_OF: return exc_overflow(regs);
+	case X86_TRAP_BR: return exc_bounds(regs);
+	case X86_TRAP_UD: return exc_invalid_op(regs);
+	case X86_TRAP_NM: return exc_device_not_available(regs);
+	case X86_TRAP_DF: return exc_double_fault(regs, error_code);
+	case X86_TRAP_TS: return exc_invalid_tss(regs, error_code);
+	case X86_TRAP_NP: return exc_segment_not_present(regs, error_code);
+	case X86_TRAP_SS: return exc_stack_segment(regs, error_code);
+	case X86_TRAP_GP: return exc_general_protection(regs, error_code);
+	case X86_TRAP_MF: return exc_coprocessor_error(regs);
+	case X86_TRAP_AC: return exc_alignment_check(regs, error_code);
+	case X86_TRAP_XF: return exc_simd_coprocessor_error(regs);
+
+#ifdef CONFIG_X86_MCE
+	case X86_TRAP_MC: return fred_exc_machine_check(regs);
+#endif
+#ifdef CONFIG_INTEL_TDX_GUEST
+	case X86_TRAP_VE: return exc_virtualization_exception(regs);
+#endif
+	/*
+	 * Guard #CP with CONFIG_X86_CET rather than CONFIG_X86_KERNEL_IBT so
+	 * that it is still handled when CET is enabled without kernel IBT.
+	 */
+#ifdef CONFIG_X86_CET
+	case X86_TRAP_CP: return exc_control_protection(regs, error_code);
+#endif
+	/* No handler for this vector */
+	default: return fred_bad_type(regs, error_code);
+	}
+}
+
+/*
+ * First-level dispatch, on the event type, for the entry path that handles
+ * events taken in user mode.
+ *
+ * @regs:	register frame saved by the entry code; regs->orig_ax holds the
+ *		event's error code on entry
+ *
+ * NMI, software exception and privileged software exception events are
+ * cross-checked against the vectors accepted for them, so that a violated
+ * assumption is reported through fred_bad_type() instead of being silently
+ * handled as some other exception.
+ */
+__visible noinstr void fred_entry_from_user(struct pt_regs *regs)
+{
+	/* Grab the error code before orig_ax is invalidated below */
+	unsigned long error_code = regs->orig_ax;
+
+	/* Invalidate orig_ax so that syscall_get_nr() works correctly */
+	regs->orig_ax = -1;
+
+	switch (regs->fred_ss.type) {
+	case EVENT_TYPE_EXTINT:
+		return fred_extint(regs);
+	case EVENT_TYPE_NMI:
+		/* X86_TRAP_NMI is the only vector accepted for this type */
+		if (likely(regs->fred_ss.vector == X86_TRAP_NMI))
+			return fred_exc_nmi(regs);
+		break;
+	case EVENT_TYPE_SWINT:
+		return fred_intx(regs);
+	case EVENT_TYPE_HWEXC:
+		return fred_exception(regs, error_code);
+	case EVENT_TYPE_SWEXC:
+		/* Only #BP and #OF are accepted as software exceptions */
+		if (likely(regs->fred_ss.vector == X86_TRAP_BP ||
+			   regs->fred_ss.vector == X86_TRAP_OF))
+			return fred_exception(regs, error_code);
+		break;
+	case EVENT_TYPE_PRIV_SWEXC:
+		/* X86_TRAP_DB is the only vector accepted for this type */
+		if (likely(regs->fred_ss.vector == X86_TRAP_DB))
+			return fred_exception(regs, error_code);
+		break;
+	case EVENT_TYPE_OTHER:
+		return fred_other(regs);
+	default:
+		break;
+	}
+
+	/* Reached for unknown types and for type/vector mismatches */
+	return fred_bad_type(regs, error_code);
+}
+
+/*
+ * First-level dispatch, on the event type, for the entry path that handles
+ * events taken in kernel mode.
+ *
+ * @regs:	register frame saved by the entry code; regs->orig_ax holds the
+ *		event's error code on entry
+ *
+ * NMI, software exception and privileged software exception events are
+ * cross-checked against the vectors accepted for them.  There are no cases
+ * for EVENT_TYPE_SWINT and EVENT_TYPE_OTHER, so those types end up in
+ * fred_bad_type() like any other unknown type or type/vector mismatch.
+ */
+__visible noinstr void fred_entry_from_kernel(struct pt_regs *regs)
+{
+	/* Grab the error code before orig_ax is invalidated below */
+	unsigned long error_code = regs->orig_ax;
+
+	/* Invalidate orig_ax so that syscall_get_nr() works correctly */
+	regs->orig_ax = -1;
+
+	switch (regs->fred_ss.type) {
+	case EVENT_TYPE_EXTINT:
+		return fred_extint(regs);
+	case EVENT_TYPE_NMI:
+		/* X86_TRAP_NMI is the only vector accepted for this type */
+		if (likely(regs->fred_ss.vector == X86_TRAP_NMI))
+			return fred_exc_nmi(regs);
+		break;
+	case EVENT_TYPE_HWEXC:
+		return fred_exception(regs, error_code);
+	case EVENT_TYPE_SWEXC:
+		/* Only #BP and #OF are accepted as software exceptions */
+		if (likely(regs->fred_ss.vector == X86_TRAP_BP ||
+			   regs->fred_ss.vector == X86_TRAP_OF))
+			return fred_exception(regs, error_code);
+		break;
+	case EVENT_TYPE_PRIV_SWEXC:
+		/* X86_TRAP_DB is the only vector accepted for this type */
+		if (likely(regs->fred_ss.vector == X86_TRAP_DB))
+			return fred_exception(regs, error_code);
+		break;
+	default:
+		break;
+	}
+
+	/* Reached for unknown types and for type/vector mismatches */
+	return fred_bad_type(regs, error_code);
+}
diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h
index b1a98fa38828..076bf8dee702 100644
--- a/arch/x86/include/asm/asm-prototypes.h
+++ b/arch/x86/include/asm/asm-prototypes.h
@@ -12,6 +12,7 @@ 
 #include <asm/special_insns.h>
 #include <asm/preempt.h>
 #include <asm/asm.h>
+#include <asm/fred.h>
 #include <asm/gsseg.h>
 
 #ifndef CONFIG_X86_CMPXCHG64
diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h
index f514fdb5a39f..16a64ffecbf8 100644
--- a/arch/x86/include/asm/fred.h
+++ b/arch/x86/include/asm/fred.h
@@ -60,6 +60,12 @@  static __always_inline unsigned long fred_event_data(struct pt_regs *regs)
 	return fred_info(regs)->edata;
 }
 
+void asm_fred_entrypoint_user(void);
+void asm_fred_entrypoint_kernel(void);
+
+__visible void fred_entry_from_user(struct pt_regs *regs);
+__visible void fred_entry_from_kernel(struct pt_regs *regs);
+
 #else /* CONFIG_X86_FRED */
 static __always_inline unsigned long fred_event_data(struct pt_regs *regs) { return 0; }
 #endif /* CONFIG_X86_FRED */