diff mbox series

[RFC,06/13] x86/uintr: Introduce uintr receiver syscalls

Message ID 20210913200132.3396598-7-sohil.mehta@intel.com (mailing list archive)
State New
Headers show
Series x86 User Interrupts support | expand

Commit Message

Sohil Mehta Sept. 13, 2021, 8:01 p.m. UTC
Any application that wants to receive a user interrupt needs to register
an interrupt handler with the kernel. Add a registration syscall that
sets up the interrupt handler and the related kernel structures for
the task that makes this syscall.

Only one interrupt handler per task can be registered with the
kernel/hardware. Each task has its private interrupt vector space of 64
vectors. The vector registration and the related FD management is
covered later.

Also add an unregister syscall to let a task unregister the interrupt
handler.

The UPID for each receiver task needs to be updated whenever a task gets
context switched or it moves from one cpu to another. This will also be
covered later. The system calls haven't been wired up yet so no real
harm is done if we don't update the UPID right now.

<Code typically in the x86/kernel directory doesn't deal with file
descriptor management. I have kept uintr_fd.c separate to make it easier
to move it somewhere else if needed.>

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Sohil Mehta <sohil.mehta@intel.com>
---
 arch/x86/include/asm/processor.h |   6 +
 arch/x86/include/asm/uintr.h     |  13 ++
 arch/x86/kernel/Makefile         |   1 +
 arch/x86/kernel/uintr_core.c     | 240 +++++++++++++++++++++++++++++++
 arch/x86/kernel/uintr_fd.c       |  58 ++++++++
 5 files changed, 318 insertions(+)
 create mode 100644 arch/x86/include/asm/uintr.h
 create mode 100644 arch/x86/kernel/uintr_core.c
 create mode 100644 arch/x86/kernel/uintr_fd.c

Comments

Greg Kroah-Hartman Sept. 23, 2021, 12:26 p.m. UTC | #1
On Mon, Sep 13, 2021 at 01:01:25PM -0700, Sohil Mehta wrote:
> Any application that wants to receive a user interrupt needs to register
> an interrupt handler with the kernel. Add a registration syscall that
> sets up the interrupt handler and the related kernel structures for
> the task that makes this syscall.
> 
> Only one interrupt handler per task can be registered with the
> kernel/hardware. Each task has its private interrupt vector space of 64
> vectors. The vector registration and the related FD management is
> covered later.
> 
> Also add an unregister syscall to let a task unregister the interrupt
> handler.
> 
> The UPID for each receiver task needs to be updated whenever a task gets
> context switched or it moves from one cpu to another. This will also be
> covered later. The system calls haven't been wired up yet so no real
> harm is done if we don't update the UPID right now.
> 
> <Code typically in the x86/kernel directory doesn't deal with file
> descriptor management. I have kept uintr_fd.c separate to make it easier
> to move it somewhere else if needed.>
> 
> Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> Signed-off-by: Sohil Mehta <sohil.mehta@intel.com>
> ---
>  arch/x86/include/asm/processor.h |   6 +
>  arch/x86/include/asm/uintr.h     |  13 ++
>  arch/x86/kernel/Makefile         |   1 +
>  arch/x86/kernel/uintr_core.c     | 240 +++++++++++++++++++++++++++++++
>  arch/x86/kernel/uintr_fd.c       |  58 ++++++++
>  5 files changed, 318 insertions(+)
>  create mode 100644 arch/x86/include/asm/uintr.h
>  create mode 100644 arch/x86/kernel/uintr_core.c
>  create mode 100644 arch/x86/kernel/uintr_fd.c
> 
> diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
> index 9ad2acaaae9b..d229bfac8b4f 100644
> --- a/arch/x86/include/asm/processor.h
> +++ b/arch/x86/include/asm/processor.h
> @@ -9,6 +9,7 @@ struct task_struct;
>  struct mm_struct;
>  struct io_bitmap;
>  struct vm86;
> +struct uintr_receiver;
>  
>  #include <asm/math_emu.h>
>  #include <asm/segment.h>
> @@ -529,6 +530,11 @@ struct thread_struct {
>  	 */
>  	u32			pkru;
>  
> +#ifdef CONFIG_X86_USER_INTERRUPTS
> +	/* User Interrupt state*/
> +	struct uintr_receiver	*ui_recv;
> +#endif
> +
>  	/* Floating point and extended processor state */
>  	struct fpu		fpu;
>  	/*
> diff --git a/arch/x86/include/asm/uintr.h b/arch/x86/include/asm/uintr.h
> new file mode 100644
> index 000000000000..4f35bd8bd4e0
> --- /dev/null
> +++ b/arch/x86/include/asm/uintr.h
> @@ -0,0 +1,13 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_X86_UINTR_H
> +#define _ASM_X86_UINTR_H
> +
> +#ifdef CONFIG_X86_USER_INTERRUPTS
> +
> +bool uintr_arch_enabled(void);
> +int do_uintr_register_handler(u64 handler);
> +int do_uintr_unregister_handler(void);
> +
> +#endif /* CONFIG_X86_USER_INTERRUPTS */
> +
> +#endif /* _ASM_X86_UINTR_H */
> diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
> index 8f4e8fa6ed75..060ca9f23e23 100644
> --- a/arch/x86/kernel/Makefile
> +++ b/arch/x86/kernel/Makefile
> @@ -140,6 +140,7 @@ obj-$(CONFIG_UPROBES)			+= uprobes.o
>  obj-$(CONFIG_PERF_EVENTS)		+= perf_regs.o
>  obj-$(CONFIG_TRACING)			+= tracepoint.o
>  obj-$(CONFIG_SCHED_MC_PRIO)		+= itmt.o
> +obj-$(CONFIG_X86_USER_INTERRUPTS)	+= uintr_fd.o uintr_core.o
>  obj-$(CONFIG_X86_UMIP)			+= umip.o
>  
>  obj-$(CONFIG_UNWINDER_ORC)		+= unwind_orc.o
> diff --git a/arch/x86/kernel/uintr_core.c b/arch/x86/kernel/uintr_core.c
> new file mode 100644
> index 000000000000..2c6042a6840a
> --- /dev/null
> +++ b/arch/x86/kernel/uintr_core.c
> @@ -0,0 +1,240 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2021, Intel Corporation.
> + *
> + * Sohil Mehta <sohil.mehta@intel.com>
> + * Jacob Pan <jacob.jun.pan@linux.intel.com>
> + */
> +#define pr_fmt(fmt)    "uintr: " fmt
> +
> +#include <linux/refcount.h>
> +#include <linux/sched.h>
> +#include <linux/sched/task.h>
> +#include <linux/slab.h>
> +#include <linux/uaccess.h>
> +
> +#include <asm/apic.h>
> +#include <asm/fpu/internal.h>
> +#include <asm/irq_vectors.h>
> +#include <asm/msr.h>
> +#include <asm/msr-index.h>
> +#include <asm/uintr.h>
> +
> +/* User Posted Interrupt Descriptor (UPID) */
> +struct uintr_upid {
> +	struct {
> +		u8 status;	/* bit 0: ON, bit 1: SN, bit 2-7: reserved */
> +		u8 reserved1;	/* Reserved */
> +		u8 nv;		/* Notification vector */
> +		u8 reserved2;	/* Reserved */

What are these "reserved" for?

> +		u32 ndst;	/* Notification destination */
> +	} nc __packed;		/* Notification control */
> +	u64 puir;		/* Posted user interrupt requests */
> +} __aligned(64);
> +
> +/* UPID Notification control status */
> +#define UPID_ON		0x0	/* Outstanding notification */
> +#define UPID_SN		0x1	/* Suppressed notification */
> +
> +struct uintr_upid_ctx {
> +	struct uintr_upid *upid;
> +	refcount_t refs;

Please use a kref for this and do not roll your own for no good reason.

> +/*
> + * sys_uintr_register_handler - setup user interrupt handler for receiver.
> + */
> +SYSCALL_DEFINE2(uintr_register_handler, u64 __user *, handler, unsigned int, flags)
> +{
> +	int ret;
> +
> +	if (!uintr_arch_enabled())
> +		return -EOPNOTSUPP;
> +
> +	if (flags)
> +		return -EINVAL;
> +
> +	/* TODO: Validate the handler address */
> +	if (!handler)
> +		return -EFAULT;

Um, that's a pretty big "TODO" here.

How are you going to define what is, and what is not, an allowed
"handler"?

I'm sure the security people would like to get involved here, as well as
the auditing people.  Have you talked with them about their requirements
for this type of stuff?

thanks,

greg k-h
Thomas Gleixner Sept. 23, 2021, 11:52 p.m. UTC | #2
On Mon, Sep 13 2021 at 13:01, Sohil Mehta wrote:
> +/* User Posted Interrupt Descriptor (UPID) */
> +struct uintr_upid {
> +	struct {
> +		u8 status;	/* bit 0: ON, bit 1: SN, bit 2-7: reserved */
> +		u8 reserved1;	/* Reserved */
> +		u8 nv;		/* Notification vector */
> +		u8 reserved2;	/* Reserved */
> +		u32 ndst;	/* Notification destination */
> +	} nc __packed;		/* Notification control */
> +	u64 puir;		/* Posted user interrupt requests */
> +} __aligned(64);
> +
> +/* UPID Notification control status */
> +#define UPID_ON		0x0	/* Outstanding notification */
> +#define UPID_SN		0x1	/* Suppressed notification */

Come on. This are bits in upid.status, right? So why can't the comment
above these defines says so and why can't the names not reflect that?

> +struct uintr_upid_ctx {
> +	struct uintr_upid *upid;
> +	refcount_t refs;

Please use tabular format for struct members. 

> +};
> +
> +struct uintr_receiver {
> +	struct uintr_upid_ctx *upid_ctx;
> +};

So we need a struct to wrap a pointer to another struct. Why?

> +inline bool uintr_arch_enabled(void)

What's this arch_enabled indirection for? Is this used anywhere in
non-architecture code?

> +{
> +	return static_cpu_has(X86_FEATURE_UINTR);
> +}
> +
> +static inline bool is_uintr_receiver(struct task_struct *t)
> +{
> +	return !!t->thread.ui_recv;
> +}
> +
> +static inline u32 cpu_to_ndst(int cpu)
> +{
> +	u32 apicid = (u32)apic->cpu_present_to_apicid(cpu);
> +
> +	WARN_ON_ONCE(apicid == BAD_APICID);

Brilliant. If x2apic is not enabled then this case returns

> +	if (!x2apic_enabled())
> +		return (apicid << 8) & 0xFF00;

  (BAD_APICID << 8) & 0xFF00 == 0xFF ....

> +int do_uintr_unregister_handler(void)
> +{
> +	struct task_struct *t = current;
> +	struct fpu *fpu = &t->thread.fpu;
> +	struct uintr_receiver *ui_recv;
> +	u64 msr64;
> +
> +	if (!is_uintr_receiver(t))
> +		return -EINVAL;
> +
> +	pr_debug("recv: Unregister handler and clear MSRs for task=%d\n",
> +		 t->pid);
> +
> +	/*
> +	 * TODO: Evaluate usage of fpregs_lock() and get_xsave_addr(). Bugs
> +	 * have been reported recently for PASID and WRPKRU.

Again. Which bugs and why haven't they been evaluated before posting?

> +	 * UPID and ui_recv will be referenced during context switch. Need to
> +	 * disable preemption while modifying the MSRs, UPID and ui_recv thread
> +	 * struct.
> +	 */
> +	fpregs_lock();

And because you need to disable preemption you need to use
fpregs_lock(), right? That's not what fpregs_lock() is about.

> +	/* Clear only the receiver specific state. Sender related state is not modified */
> +	if (fpregs_state_valid(fpu, smp_processor_id())) {
> +		/* Modify only the relevant bits of the MISC MSR */
> +		rdmsrl(MSR_IA32_UINTR_MISC, msr64);
> +		msr64 &= ~GENMASK_ULL(39, 32);

This is exactly the crap which results from not defining stuff
properly. Random numbers in code which nobody can understand.

> +		wrmsrl(MSR_IA32_UINTR_MISC, msr64);
> +		wrmsrl(MSR_IA32_UINTR_PD, 0ULL);
> +		wrmsrl(MSR_IA32_UINTR_RR, 0ULL);
> +		wrmsrl(MSR_IA32_UINTR_STACKADJUST, 0ULL);
> +		wrmsrl(MSR_IA32_UINTR_HANDLER, 0ULL);
> +	} else {
> +		struct uintr_state *p;
> +
> +		p = get_xsave_addr(&fpu->state.xsave, XFEATURE_UINTR);
> +		if (p) {
> +			p->handler = 0;
> +			p->stack_adjust = 0;
> +			p->upid_addr = 0;
> +			p->uinv = 0;
> +			p->uirr = 0;
> +		}

So p == NULL is expected here?

> +	}
> +
> +	ui_recv = t->thread.ui_recv;
> +	/*
> +	 * Suppress notifications so that no further interrupts are generated
> +	 * based on this UPID.
> +	 */
> +	set_bit(UPID_SN, (unsigned long *)&ui_recv->upid_ctx->upid->nc.status);
> +
> +	put_upid_ref(ui_recv->upid_ctx);
> +	kfree(ui_recv);
> +	t->thread.ui_recv = NULL;

Why has this put/kfree stuff to be in the fpregs locked section?

> +	fpregs_unlock();
> +
> +	return 0;
> +}
> +
> +int do_uintr_register_handler(u64 handler)
> +{
> +	struct uintr_receiver *ui_recv;
> +	struct uintr_upid *upid;
> +	struct task_struct *t = current;
> +	struct fpu *fpu = &t->thread.fpu;
> +	u64 misc_msr;
> +	int cpu;
> +
> +	if (is_uintr_receiver(t))
> +		return -EBUSY;
> +
> +	ui_recv = kzalloc(sizeof(*ui_recv), GFP_KERNEL);
> +	if (!ui_recv)
> +		return -ENOMEM;
> +
> +	ui_recv->upid_ctx = alloc_upid();
> +	if (!ui_recv->upid_ctx) {
> +		kfree(ui_recv);
> +		pr_debug("recv: alloc upid failed for task=%d\n", t->pid);
> +		return -ENOMEM;
> +	}
> +
> +	/*
> +	 * TODO: Evaluate usage of fpregs_lock() and get_xsave_addr(). Bugs
> +	 * have been reported recently for PASID and WRPKRU.

Oh well.

> +	 * UPID and ui_recv will be referenced during context switch. Need to
> +	 * disable preemption while modifying the MSRs, UPID and ui_recv thread
> +	 * struct.

See above.

> +	 */
> +	fpregs_lock();
> +
> +	cpu = smp_processor_id();
> +	upid = ui_recv->upid_ctx->upid;
> +	upid->nc.nv = UINTR_NOTIFICATION_VECTOR;
> +	upid->nc.ndst = cpu_to_ndst(cpu);
> +
> +	t->thread.ui_recv = ui_recv;
> +
> +	if (fpregs_state_valid(fpu, cpu)) {
> +		wrmsrl(MSR_IA32_UINTR_HANDLER, handler);
> +		wrmsrl(MSR_IA32_UINTR_PD, (u64)ui_recv->upid_ctx->upid);
> +
> +		/* Set value as size of ABI redzone */
> +		wrmsrl(MSR_IA32_UINTR_STACKADJUST, 128);
> +
> +		/* Modify only the relevant bits of the MISC MSR */
> +		rdmsrl(MSR_IA32_UINTR_MISC, misc_msr);
> +		misc_msr |= (u64)UINTR_NOTIFICATION_VECTOR << 32;
> +		wrmsrl(MSR_IA32_UINTR_MISC, misc_msr);
> +	} else {
> +		struct xregs_state *xsave;
> +		struct uintr_state *p;
> +
> +		xsave = &fpu->state.xsave;
> +		xsave->header.xfeatures |= XFEATURE_MASK_UINTR;
> +		p = get_xsave_addr(&fpu->state.xsave, XFEATURE_UINTR);
> +		if (p) {
> +			p->handler = handler;
> +			p->upid_addr = (u64)ui_recv->upid_ctx->upid;
> +			p->stack_adjust = 128;
> +			p->uinv = UINTR_NOTIFICATION_VECTOR;
> +		}

Again. How is p supposed to be NULL and if so, why is this silently
treating this as success?

> +	}
> +
> +	fpregs_unlock();

Thanks,

        tglx
Thomas Gleixner Sept. 24, 2021, 12:05 a.m. UTC | #3
On Thu, Sep 23 2021 at 14:26, Greg KH wrote:
> On Mon, Sep 13, 2021 at 01:01:25PM -0700, Sohil Mehta wrote:
>> +SYSCALL_DEFINE2(uintr_register_handler, u64 __user *, handler, unsigned int, flags)
>> +{
>> +	int ret;
>> +
>> +	if (!uintr_arch_enabled())
>> +		return -EOPNOTSUPP;
>> +
>> +	if (flags)
>> +		return -EINVAL;
>> +
>> +	/* TODO: Validate the handler address */
>> +	if (!handler)
>> +		return -EFAULT;
>
> Um, that's a pretty big "TODO" here.
>
> How are you going to define what is, and what is not, an allowed
> "handler"?

The requirement is obviously that this is a valid user space address,
but that's so hard to validate that it needs to be done later.

At least the documentation claims that a non user space address should
result in a #GP on delivery. Whether that holds in all corner cases (see
the spurious handling muck) is a different question and might come back
to us later through a channel which we hate with a passion :)

> I'm sure the security people would like to get involved here, as well as
> the auditing people.  Have you talked with them about their requirements
> for this type of stuff?

The handler is strictly a user space address and user space is generally
allowed to shoot itself into the foot. If the address is bogus then this
will resolve into inaccessible, not-mapped or not exectuable space and
the application can keep the pieces.

Whether the hardware handles the resulting exception correctly is a
different question, but that can't be prevented by any sanity check on
the address at registration time.

Thanks,

        tglx
Sohil Mehta Sept. 27, 2021, 11:20 p.m. UTC | #4
On 9/23/2021 5:26 AM, Greg KH wrote:
> On Mon, Sep 13, 2021 at 01:01:25PM -0700, Sohil Mehta wrote:
>> +
>> +/* User Posted Interrupt Descriptor (UPID) */
>> +struct uintr_upid {
>> +	struct {
>> +		u8 status;	/* bit 0: ON, bit 1: SN, bit 2-7: reserved */
>> +		u8 reserved1;	/* Reserved */
>> +		u8 nv;		/* Notification vector */
>> +		u8 reserved2;	/* Reserved */
> What are these "reserved" for?


The UPID is an architectural data structure defined by the hardware. The 
reserved fields are defined by the hardware (likely to keep the 
structure size as 16 bytes).


>
>> +struct uintr_upid_ctx {
>> +	struct uintr_upid *upid;
>> +	refcount_t refs;
> Please use a kref for this and do not roll your own for no good reason.

Sure. Will do.
Sohil Mehta Sept. 27, 2021, 11:57 p.m. UTC | #5
On 9/23/2021 4:52 PM, Thomas Gleixner wrote:
> On Mon, Sep 13 2021 at 13:01, Sohil Mehta wrote:
>
> +/* UPID Notification control status */
> +#define UPID_ON		0x0	/* Outstanding notification */
> +#define UPID_SN		0x1	/* Suppressed notification */
> Come on. This are bits in upid.status, right? So why can't the comment
> above these defines says so and why can't the names not reflect that?
I'll fix this.
>> +struct uintr_upid_ctx {
>> +	struct uintr_upid *upid;
>> +	refcount_t refs;
> Please use tabular format for struct members.
Will do.
>> +};
>> +
>> +struct uintr_receiver {
>> +	struct uintr_upid_ctx *upid_ctx;
>> +};
> So we need a struct to wrap a pointer to another struct. Why?

The struct will have more members added later.  Should the wrapper be 
created then?

I didn't want to add members that are not used in this patch.

>> +inline bool uintr_arch_enabled(void)
> What's this arch_enabled indirection for? Is this used anywhere in
> non-architecture code?


I'll remove this indirection.

It is a remnant of some older code that had uintr_fd managed outside of 
the x86 code.

>> +{
>> +	return static_cpu_has(X86_FEATURE_UINTR);
>> +}
>> +
>> +static inline bool is_uintr_receiver(struct task_struct *t)
>> +{
>> +	return !!t->thread.ui_recv;
>> +}
>> +
>> +static inline u32 cpu_to_ndst(int cpu)
>> +{
>> +	u32 apicid = (u32)apic->cpu_present_to_apicid(cpu);
>> +
>> +	WARN_ON_ONCE(apicid == BAD_APICID);
> Brilliant. If x2apic is not enabled then this case returns


I'll fix this.

>> +	if (!x2apic_enabled())
>> +		return (apicid << 8) & 0xFF00;
>    (BAD_APICID << 8) & 0xFF00 == 0xFF ....
>
>> +int do_uintr_unregister_handler(void)
>> +{
>> +	struct task_struct *t = current;
>> +	struct fpu *fpu = &t->thread.fpu;
>> +	struct uintr_receiver *ui_recv;
>> +	u64 msr64;
>> +
>> +	if (!is_uintr_receiver(t))
>> +		return -EINVAL;
>> +
>> +	pr_debug("recv: Unregister handler and clear MSRs for task=%d\n",
>> +		 t->pid);
>> +
>> +	/*
>> +	 * TODO: Evaluate usage of fpregs_lock() and get_xsave_addr(). Bugs
>> +	 * have been reported recently for PASID and WRPKRU.
> Again. Which bugs and why haven't they been evaluated before posting?
I apologize again. This comment is no longer valid.
>> +	 * UPID and ui_recv will be referenced during context switch. Need to
>> +	 * disable preemption while modifying the MSRs, UPID and ui_recv thread
>> +	 * struct.
>> +	 */
>> +	fpregs_lock();
> And because you need to disable preemption you need to use
> fpregs_lock(), right? That's not what fpregs_lock() is about.
>
Got it. I'll evaluate the use of fpregs_lock() at all places.
>> +		wrmsrl(MSR_IA32_UINTR_MISC, msr64);
>> +		wrmsrl(MSR_IA32_UINTR_PD, 0ULL);
>> +		wrmsrl(MSR_IA32_UINTR_RR, 0ULL);
>> +		wrmsrl(MSR_IA32_UINTR_STACKADJUST, 0ULL);
>> +		wrmsrl(MSR_IA32_UINTR_HANDLER, 0ULL);
>> +	} else {
>> +		struct uintr_state *p;
>> +
>> +		p = get_xsave_addr(&fpu->state.xsave, XFEATURE_UINTR);
>> +		if (p) {
>> +			p->handler = 0;
>> +			p->stack_adjust = 0;
>> +			p->upid_addr = 0;
>> +			p->uinv = 0;
>> +			p->uirr = 0;
>> +		}
> So p == NULL is expected here?
I'll fix this and other usages of get_xsave_addr().

Thanks,

Sohil
Greg Kroah-Hartman Sept. 28, 2021, 4:39 a.m. UTC | #6
On Mon, Sep 27, 2021 at 04:20:25PM -0700, Sohil Mehta wrote:
> On 9/23/2021 5:26 AM, Greg KH wrote:
> > On Mon, Sep 13, 2021 at 01:01:25PM -0700, Sohil Mehta wrote:
> > > +
> > > +/* User Posted Interrupt Descriptor (UPID) */
> > > +struct uintr_upid {
> > > +	struct {
> > > +		u8 status;	/* bit 0: ON, bit 1: SN, bit 2-7: reserved */
> > > +		u8 reserved1;	/* Reserved */
> > > +		u8 nv;		/* Notification vector */
> > > +		u8 reserved2;	/* Reserved */
> > What are these "reserved" for?
> 
> 
> The UPID is an architectural data structure defined by the hardware. The
> reserved fields are defined by the hardware (likely to keep the structure
> size as 16 bytes).

Then those values must be set to 0, right?  I think I missed the part of
the code that set them, hopefully it's somewhere...

thanks,

greg k-h
Sohil Mehta Sept. 28, 2021, 4:47 p.m. UTC | #7
On 9/27/2021 9:39 PM, Greg KH wrote:
>
> Then those values must be set to 0, right?  I think I missed the part of
> the code that set them, hopefully it's somewhere...

Yes. The kzalloc() as part of alloc_upid() does it.

Thanks,

Sohil
diff mbox series

Patch

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 9ad2acaaae9b..d229bfac8b4f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -9,6 +9,7 @@  struct task_struct;
 struct mm_struct;
 struct io_bitmap;
 struct vm86;
+struct uintr_receiver;
 
 #include <asm/math_emu.h>
 #include <asm/segment.h>
@@ -529,6 +530,11 @@  struct thread_struct {
 	 */
 	u32			pkru;
 
+#ifdef CONFIG_X86_USER_INTERRUPTS
+	/* User Interrupt state*/
+	struct uintr_receiver	*ui_recv;
+#endif
+
 	/* Floating point and extended processor state */
 	struct fpu		fpu;
 	/*
diff --git a/arch/x86/include/asm/uintr.h b/arch/x86/include/asm/uintr.h
new file mode 100644
index 000000000000..4f35bd8bd4e0
--- /dev/null
+++ b/arch/x86/include/asm/uintr.h
@@ -0,0 +1,13 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_UINTR_H
+#define _ASM_X86_UINTR_H
+
+#ifdef CONFIG_X86_USER_INTERRUPTS
+
+bool uintr_arch_enabled(void);
+int do_uintr_register_handler(u64 handler);
+int do_uintr_unregister_handler(void);
+
+#endif /* CONFIG_X86_USER_INTERRUPTS */
+
+#endif /* _ASM_X86_UINTR_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8f4e8fa6ed75..060ca9f23e23 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -140,6 +140,7 @@  obj-$(CONFIG_UPROBES)			+= uprobes.o
 obj-$(CONFIG_PERF_EVENTS)		+= perf_regs.o
 obj-$(CONFIG_TRACING)			+= tracepoint.o
 obj-$(CONFIG_SCHED_MC_PRIO)		+= itmt.o
+obj-$(CONFIG_X86_USER_INTERRUPTS)	+= uintr_fd.o uintr_core.o
 obj-$(CONFIG_X86_UMIP)			+= umip.o
 
 obj-$(CONFIG_UNWINDER_ORC)		+= unwind_orc.o
diff --git a/arch/x86/kernel/uintr_core.c b/arch/x86/kernel/uintr_core.c
new file mode 100644
index 000000000000..2c6042a6840a
--- /dev/null
+++ b/arch/x86/kernel/uintr_core.c
@@ -0,0 +1,240 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2021, Intel Corporation.
+ *
+ * Sohil Mehta <sohil.mehta@intel.com>
+ * Jacob Pan <jacob.jun.pan@linux.intel.com>
+ */
+#define pr_fmt(fmt)    "uintr: " fmt
+
+#include <linux/refcount.h>
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include <asm/apic.h>
+#include <asm/fpu/internal.h>
+#include <asm/irq_vectors.h>
+#include <asm/msr.h>
+#include <asm/msr-index.h>
+#include <asm/uintr.h>
+
+/* User Posted Interrupt Descriptor (UPID) */
+struct uintr_upid {
+	struct {
+		u8 status;	/* bit 0: ON, bit 1: SN, bit 2-7: reserved */
+		u8 reserved1;	/* Reserved */
+		u8 nv;		/* Notification vector */
+		u8 reserved2;	/* Reserved */
+		u32 ndst;	/* Notification destination */
+	} nc __packed;		/* Notification control */
+	u64 puir;		/* Posted user interrupt requests */
+} __aligned(64);
+
+/* UPID Notification control status */
+#define UPID_ON		0x0	/* Outstanding notification */
+#define UPID_SN		0x1	/* Suppressed notification */
+
+struct uintr_upid_ctx {
+	struct uintr_upid *upid;
+	refcount_t refs;
+};
+
+struct uintr_receiver {
+	struct uintr_upid_ctx *upid_ctx;
+};
+
+inline bool uintr_arch_enabled(void)
+{
+	return static_cpu_has(X86_FEATURE_UINTR);
+}
+
+static inline bool is_uintr_receiver(struct task_struct *t)
+{
+	return !!t->thread.ui_recv;
+}
+
+static inline u32 cpu_to_ndst(int cpu)
+{
+	u32 apicid = (u32)apic->cpu_present_to_apicid(cpu);
+
+	WARN_ON_ONCE(apicid == BAD_APICID);
+
+	if (!x2apic_enabled())
+		return (apicid << 8) & 0xFF00;
+
+	return apicid;
+}
+
+static void free_upid(struct uintr_upid_ctx *upid_ctx)
+{
+	kfree(upid_ctx->upid);
+	upid_ctx->upid = NULL;
+	kfree(upid_ctx);
+}
+
+/* TODO: UPID needs to be allocated by a KPTI compatible allocator */
+static struct uintr_upid_ctx *alloc_upid(void)
+{
+	struct uintr_upid_ctx *upid_ctx;
+	struct uintr_upid *upid;
+
+	upid_ctx = kzalloc(sizeof(*upid_ctx), GFP_KERNEL);
+	if (!upid_ctx)
+		return NULL;
+
+	upid = kzalloc(sizeof(*upid), GFP_KERNEL);
+
+	if (!upid) {
+		kfree(upid_ctx);
+		return NULL;
+	}
+
+	upid_ctx->upid = upid;
+	refcount_set(&upid_ctx->refs, 1);
+
+	return upid_ctx;
+}
+
+static void put_upid_ref(struct uintr_upid_ctx *upid_ctx)
+{
+	if (refcount_dec_and_test(&upid_ctx->refs))
+		free_upid(upid_ctx);
+}
+
+int do_uintr_unregister_handler(void)
+{
+	struct task_struct *t = current;
+	struct fpu *fpu = &t->thread.fpu;
+	struct uintr_receiver *ui_recv;
+	u64 msr64;
+
+	if (!is_uintr_receiver(t))
+		return -EINVAL;
+
+	pr_debug("recv: Unregister handler and clear MSRs for task=%d\n",
+		 t->pid);
+
+	/*
+	 * TODO: Evaluate usage of fpregs_lock() and get_xsave_addr(). Bugs
+	 * have been reported recently for PASID and WRPKRU.
+	 *
+	 * UPID and ui_recv will be referenced during context switch. Need to
+	 * disable preemption while modifying the MSRs, UPID and ui_recv thread
+	 * struct.
+	 */
+	fpregs_lock();
+
+	/* Clear only the receiver specific state. Sender related state is not modified */
+	if (fpregs_state_valid(fpu, smp_processor_id())) {
+		/* Modify only the relevant bits of the MISC MSR */
+		rdmsrl(MSR_IA32_UINTR_MISC, msr64);
+		msr64 &= ~GENMASK_ULL(39, 32);
+		wrmsrl(MSR_IA32_UINTR_MISC, msr64);
+		wrmsrl(MSR_IA32_UINTR_PD, 0ULL);
+		wrmsrl(MSR_IA32_UINTR_RR, 0ULL);
+		wrmsrl(MSR_IA32_UINTR_STACKADJUST, 0ULL);
+		wrmsrl(MSR_IA32_UINTR_HANDLER, 0ULL);
+	} else {
+		struct uintr_state *p;
+
+		p = get_xsave_addr(&fpu->state.xsave, XFEATURE_UINTR);
+		if (p) {
+			p->handler = 0;
+			p->stack_adjust = 0;
+			p->upid_addr = 0;
+			p->uinv = 0;
+			p->uirr = 0;
+		}
+	}
+
+	ui_recv = t->thread.ui_recv;
+	/*
+	 * Suppress notifications so that no further interrupts are generated
+	 * based on this UPID.
+	 */
+	set_bit(UPID_SN, (unsigned long *)&ui_recv->upid_ctx->upid->nc.status);
+
+	put_upid_ref(ui_recv->upid_ctx);
+	kfree(ui_recv);
+	t->thread.ui_recv = NULL;
+
+	fpregs_unlock();
+
+	return 0;
+}
+
+int do_uintr_register_handler(u64 handler)
+{
+	struct uintr_receiver *ui_recv;
+	struct uintr_upid *upid;
+	struct task_struct *t = current;
+	struct fpu *fpu = &t->thread.fpu;
+	u64 misc_msr;
+	int cpu;
+
+	if (is_uintr_receiver(t))
+		return -EBUSY;
+
+	ui_recv = kzalloc(sizeof(*ui_recv), GFP_KERNEL);
+	if (!ui_recv)
+		return -ENOMEM;
+
+	ui_recv->upid_ctx = alloc_upid();
+	if (!ui_recv->upid_ctx) {
+		kfree(ui_recv);
+		pr_debug("recv: alloc upid failed for task=%d\n", t->pid);
+		return -ENOMEM;
+	}
+
+	/*
+	 * TODO: Evaluate usage of fpregs_lock() and get_xsave_addr(). Bugs
+	 * have been reported recently for PASID and WRPKRU.
+	 *
+	 * UPID and ui_recv will be referenced during context switch. Need to
+	 * disable preemption while modifying the MSRs, UPID and ui_recv thread
+	 * struct.
+	 */
+	fpregs_lock();
+
+	cpu = smp_processor_id();
+	upid = ui_recv->upid_ctx->upid;
+	upid->nc.nv = UINTR_NOTIFICATION_VECTOR;
+	upid->nc.ndst = cpu_to_ndst(cpu);
+
+	t->thread.ui_recv = ui_recv;
+
+	if (fpregs_state_valid(fpu, cpu)) {
+		wrmsrl(MSR_IA32_UINTR_HANDLER, handler);
+		wrmsrl(MSR_IA32_UINTR_PD, (u64)ui_recv->upid_ctx->upid);
+
+		/* Set value as size of ABI redzone */
+		wrmsrl(MSR_IA32_UINTR_STACKADJUST, 128);
+
+		/* Modify only the relevant bits of the MISC MSR */
+		rdmsrl(MSR_IA32_UINTR_MISC, misc_msr);
+		misc_msr |= (u64)UINTR_NOTIFICATION_VECTOR << 32;
+		wrmsrl(MSR_IA32_UINTR_MISC, misc_msr);
+	} else {
+		struct xregs_state *xsave;
+		struct uintr_state *p;
+
+		xsave = &fpu->state.xsave;
+		xsave->header.xfeatures |= XFEATURE_MASK_UINTR;
+		p = get_xsave_addr(&fpu->state.xsave, XFEATURE_UINTR);
+		if (p) {
+			p->handler = handler;
+			p->upid_addr = (u64)ui_recv->upid_ctx->upid;
+			p->stack_adjust = 128;
+			p->uinv = UINTR_NOTIFICATION_VECTOR;
+		}
+	}
+
+	fpregs_unlock();
+
+	pr_debug("recv: task=%d register handler=%llx upid %px\n",
+		 t->pid, handler, upid);
+
+	return 0;
+}
diff --git a/arch/x86/kernel/uintr_fd.c b/arch/x86/kernel/uintr_fd.c
new file mode 100644
index 000000000000..a1a9c105fdab
--- /dev/null
+++ b/arch/x86/kernel/uintr_fd.c
@@ -0,0 +1,58 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2021, Intel Corporation.
+ *
+ * Sohil Mehta <sohil.mehta@intel.com>
+ */
+#define pr_fmt(fmt)	"uintr: " fmt
+
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+
+#include <asm/uintr.h>
+
+/*
+ * sys_uintr_register_handler - setup user interrupt handler for receiver.
+ */
+SYSCALL_DEFINE2(uintr_register_handler, u64 __user *, handler, unsigned int, flags)
+{
+	int ret;
+
+	if (!uintr_arch_enabled())
+		return -EOPNOTSUPP;
+
+	if (flags)
+		return -EINVAL;
+
+	/* TODO: Validate the handler address */
+	if (!handler)
+		return -EFAULT;
+
+	ret = do_uintr_register_handler((u64)handler);
+
+	pr_debug("recv: register handler task=%d flags %d handler %lx ret %d\n",
+		 current->pid, flags, (unsigned long)handler, ret);
+
+	return ret;
+}
+
+/*
+ * sys_uintr_unregister_handler - Teardown user interrupt handler for receiver.
+ */
+SYSCALL_DEFINE1(uintr_unregister_handler, unsigned int, flags)
+{
+	int ret;
+
+	if (!uintr_arch_enabled())
+		return -EOPNOTSUPP;
+
+	if (flags)
+		return -EINVAL;
+
+	ret = do_uintr_unregister_handler();
+
+	pr_debug("recv: unregister handler task=%d flags %d ret %d\n",
+		 current->pid, flags, ret);
+
+	return ret;
+}