diff mbox series

[RFC,09/13] x86/uintr: Introduce vector registration and uintr_fd syscall

Message ID 20210913200132.3396598-10-sohil.mehta@intel.com (mailing list archive)
State New
Headers show
Series x86 User Interrupts support | expand

Commit Message

Sohil Mehta Sept. 13, 2021, 8:01 p.m. UTC
Each receiving task has its own interrupt vector space of 64 vectors.
For each vector registered by a task create a uintr_fd. Only tasks that
have previously registered a user interrupt handler can register a
vector.

The sender for the user interrupt could be another userspace
application, kernel or an external source (like a device). Any sender
that wants to generate a user interrupt needs access to receiver's
vector number and UPID.  uintr_fd abstracts that information and allows
a sender with access to uintr_fd to connect and generate a user
interrupt. Upon interrupt delivery, the interrupt handler would be
invoked with the associated vector number pushed onto the stack.

Using an FD abstraction automatically provides a secure mechanism to
connect with a receiver. It also makes the tracking and management of
the interrupt vector resource easier for userspace.

uintr_fd can be useful in some of the usages where eventfd is used for
userspace event notifications. Though uintr_fd is nowhere close to a
drop-in replacement, the semantics are meant to be somewhat similar to
an eventfd or the write end of a pipe.

Access to uintr_fd can be achieved in the following ways:
- direct access if the task is part of the same thread group (process)
- inherited by a child process.
- explicitly shared using any of the FD sharing mechanisms.

If the sender is another userspace task, it can use the uintr_fd to send
user IPIs to the receiver. This works in conjunction with the SENDUIPI
instruction. The details related to this are covered later.

The exact APIs for the sender being a kernel or another external source
are still being worked upon. The general idea is that the receiver would
pass the uintr_fd to the kernel by extending some existing API (like
io_uring).

The vector associated with uintr_fd can be unregistered by closing all
references to the uintr_fd.

Signed-off-by: Sohil Mehta <sohil.mehta@intel.com>
---
 arch/x86/include/asm/uintr.h |  14 ++++
 arch/x86/kernel/uintr_core.c | 129 +++++++++++++++++++++++++++++++++--
 arch/x86/kernel/uintr_fd.c   |  94 +++++++++++++++++++++++++
 3 files changed, 232 insertions(+), 5 deletions(-)

Comments

Thomas Gleixner Sept. 24, 2021, 10:33 a.m. UTC | #1
On Mon, Sep 13 2021 at 13:01, Sohil Mehta wrote:
>  static void free_upid(struct uintr_upid_ctx *upid_ctx)
>  {
> +	put_task_struct(upid_ctx->task);

That's current, right?

>  	kfree(upid_ctx->upid);
>  	upid_ctx->upid = NULL;
>  	kfree(upid_ctx);
> @@ -93,6 +93,7 @@ static struct uintr_upid_ctx *alloc_upid(void)
>  
>  	upid_ctx->upid = upid;
>  	refcount_set(&upid_ctx->refs, 1);
> +	upid_ctx->task = get_task_struct(current);

Current takes a refcount on it's own task struct when allocating upid,
and releases it at some point when freeing upid, right?

What for? Comments are overrated, except for comments describing
the obvious in the wrong way.

If this ever comes back in some form, then I pretty please want the life
time rules of this documented properly. The current state is
unreviewable.

>  	return upid_ctx;
>  }
> @@ -103,6 +104,77 @@ static void put_upid_ref(struct uintr_upid_ctx *upid_ctx)
>  		free_upid(upid_ctx);
>  }
>  
> +static struct uintr_upid_ctx *get_upid_ref(struct uintr_upid_ctx *upid_ctx)
> +{
> +	refcount_inc(&upid_ctx->refs);
> +	return upid_ctx;
> +}
> +
> +static void __clear_vector_from_upid(u64 uvec, struct uintr_upid *upid)
> +{
> +	clear_bit(uvec, (unsigned long *)&upid->puir);
> +}
> +
> +static void __clear_vector_from_task(u64 uvec)
> +{
> +	struct task_struct *t = current;
> +
> +	pr_debug("recv: task=%d free vector %llu\n", t->pid, uvec);
> +
> +	if (!(BIT_ULL(uvec) & t->thread.ui_recv->uvec_mask))
> +		return;
> +
> +	clear_bit(uvec, (unsigned long *)&t->thread.ui_recv->uvec_mask);
> +
> +	if (!t->thread.ui_recv->uvec_mask)
> +		pr_debug("recv: task=%d unregistered all user vectors\n", t->pid);

Once you are done debugging this complex function can you please turn it
into an unconditional clear_bit(...) at the call site?

> +/* Callback to clear the vector structures when a vector is unregistered. */
> +static void receiver_clear_uvec(struct callback_head *head)
> +{
> +	struct uintr_receiver_info *r_info;
> +	struct uintr_upid_ctx *upid_ctx;
> +	struct task_struct *t = current;
> +	u64 uvec;
> +
> +	r_info = container_of(head, struct uintr_receiver_info, twork);
> +	uvec = r_info->uvec;
> +	upid_ctx = r_info->upid_ctx;
> +
> +	/*
> +	 * If a task has unregistered the interrupt handler the vector
> +	 * structures would have already been cleared.

would ? No. They must have been cleared already, anything else is a bug.

> +	 */
> +	if (is_uintr_receiver(t)) {
> +		/*
> +		 * The UPID context in the callback might differ from the one
> +		 * on the task if the task unregistered its interrupt handler
> +		 * and then registered itself again. The vector structures
> +		 * related to the previous UPID would have already been cleared
> +		 * in that case.
> +		 */
> +		if (t->thread.ui_recv->upid_ctx != upid_ctx) {
> +			pr_debug("recv: task %d is now using a different UPID\n",
> +				 t->pid);
> +			goto out_free;
> +		}
> +
> +		/*
> +		 * If the vector has been recognized in the UIRR don't modify
> +		 * it. We need to disable User Interrupts before modifying the
> +		 * UIRR. It might be better to just let that interrupt get
> +		 * delivered.

Might be better? Please provide coherent explanations why this is correct.

> +		 */
> +		__clear_vector_from_upid(uvec, upid_ctx->upid);
> +		__clear_vector_from_task(uvec);
> +	}
> +
> +out_free:
> +	put_upid_ref(upid_ctx);
> +	kfree(r_info);
> +}
> +
>  int do_uintr_unregister_handler(void)
>  {
>  	struct task_struct *t = current;
> @@ -239,6 +311,53 @@ int do_uintr_register_handler(u64 handler)
>  	return 0;
>  }
>  
> +void do_uintr_unregister_vector(struct uintr_receiver_info *r_info)
> +{
> +	int ret;
> +
> +	pr_debug("recv: Adding task work to clear vector %llu added for task=%d\n",
> +		 r_info->uvec, r_info->upid_ctx->task->pid);
> +
> +	init_task_work(&r_info->twork, receiver_clear_uvec);

How is this safe? Reinitialization has to be serialized against other
usage. Again: Document the life time and serialization rules. Your
pr_debugs sprinkled all over the place are not a replacement for that.

> +	ret = task_work_add(r_info->upid_ctx->task, &r_info->twork, true);

Care to look at the type of the third argument of task_work_add()?

> +struct uintrfd_ctx {
> +	struct uintr_receiver_info *r_info;

Yet another wrapper struct? What for?

> +/*
> + * sys_uintr_create_fd - Create a uintr_fd for the registered interrupt vector.

So this creates a file descriptor for a vector which is already
allocated and then it calls do_uintr_register_vector() which allocates
the vector?

> + */
> +SYSCALL_DEFINE2(uintr_create_fd, u64, vector, unsigned int, flags)
> +{

Thanks,

        tglx
Sohil Mehta Sept. 28, 2021, 8:40 p.m. UTC | #2
On 9/24/2021 3:33 AM, Thomas Gleixner wrote:
> If this ever comes back in some form, then I pretty please want the life
> time rules of this documented properly.

I'll document the life time rules for the UPID, vector and UITT next 
time. I realize now that they are quite convoluted in the current 
implementation.

I'll also fix the concurrency and serialization issues that you 
mentioned in this patch and the next one.


>
>> +	ret = task_work_add(r_info->upid_ctx->task, &r_info->twork, true);
> Care to look at the type of the third argument of task_work_add()?

Ah! I didn't realize the third argument changed a long time back.


>
>> +/*
>> + * sys_uintr_create_fd - Create a uintr_fd for the registered interrupt vector.
> So this creates a file descriptor for a vector which is already
> allocated and then it calls do_uintr_register_vector() which allocates
> the vector?

The syscall comment is misleading. Will fix it.

Vector allocation happens in userspace. The application is only expected 
to register the vector.

This syscall only creates an FD abstraction for the vector that is 
*being* registered.

>> + */
>> +SYSCALL_DEFINE2(uintr_create_fd, u64, vector, unsigned int, flags)
>> +{

Thanks,

Sohil
diff mbox series

Patch

diff --git a/arch/x86/include/asm/uintr.h b/arch/x86/include/asm/uintr.h
index cef4dd81d40e..1f00e2a63da4 100644
--- a/arch/x86/include/asm/uintr.h
+++ b/arch/x86/include/asm/uintr.h
@@ -4,9 +4,23 @@ 
 
 #ifdef CONFIG_X86_USER_INTERRUPTS
 
+struct uintr_upid_ctx {
+	struct task_struct *task;	/* Receiver task */
+	struct uintr_upid *upid;
+	refcount_t refs;
+};
+
+struct uintr_receiver_info {
+	struct uintr_upid_ctx *upid_ctx;	/* UPID context */
+	struct callback_head twork;		/* Task work head */
+	u64 uvec;				/* Vector number */
+};
+
 bool uintr_arch_enabled(void);
 int do_uintr_register_handler(u64 handler);
 int do_uintr_unregister_handler(void);
+int do_uintr_register_vector(struct uintr_receiver_info *r_info);
+void do_uintr_unregister_vector(struct uintr_receiver_info *r_info);
 
 void uintr_free(struct task_struct *task);
 
diff --git a/arch/x86/kernel/uintr_core.c b/arch/x86/kernel/uintr_core.c
index a2a13f890139..9dcb9f60e5bc 100644
--- a/arch/x86/kernel/uintr_core.c
+++ b/arch/x86/kernel/uintr_core.c
@@ -11,6 +11,7 @@ 
 #include <linux/sched.h>
 #include <linux/sched/task.h>
 #include <linux/slab.h>
+#include <linux/task_work.h>
 #include <linux/uaccess.h>
 
 #include <asm/apic.h>
@@ -20,6 +21,8 @@ 
 #include <asm/msr-index.h>
 #include <asm/uintr.h>
 
+#define UINTR_MAX_UVEC_NR 64
+
 /* User Posted Interrupt Descriptor (UPID) */
 struct uintr_upid {
 	struct {
@@ -36,13 +39,9 @@  struct uintr_upid {
 #define UPID_ON		0x0	/* Outstanding notification */
 #define UPID_SN		0x1	/* Suppressed notification */
 
-struct uintr_upid_ctx {
-	struct uintr_upid *upid;
-	refcount_t refs;
-};
-
 struct uintr_receiver {
 	struct uintr_upid_ctx *upid_ctx;
+	u64 uvec_mask;	/* track active vector per bit */
 };
 
 inline bool uintr_arch_enabled(void)
@@ -69,6 +68,7 @@  static inline u32 cpu_to_ndst(int cpu)
 
 static void free_upid(struct uintr_upid_ctx *upid_ctx)
 {
+	put_task_struct(upid_ctx->task);
 	kfree(upid_ctx->upid);
 	upid_ctx->upid = NULL;
 	kfree(upid_ctx);
@@ -93,6 +93,7 @@  static struct uintr_upid_ctx *alloc_upid(void)
 
 	upid_ctx->upid = upid;
 	refcount_set(&upid_ctx->refs, 1);
+	upid_ctx->task = get_task_struct(current);
 
 	return upid_ctx;
 }
@@ -103,6 +104,77 @@  static void put_upid_ref(struct uintr_upid_ctx *upid_ctx)
 		free_upid(upid_ctx);
 }
 
+static struct uintr_upid_ctx *get_upid_ref(struct uintr_upid_ctx *upid_ctx)
+{
+	refcount_inc(&upid_ctx->refs);
+	return upid_ctx;
+}
+
+static void __clear_vector_from_upid(u64 uvec, struct uintr_upid *upid)
+{
+	clear_bit(uvec, (unsigned long *)&upid->puir);
+}
+
+static void __clear_vector_from_task(u64 uvec)
+{
+	struct task_struct *t = current;
+
+	pr_debug("recv: task=%d free vector %llu\n", t->pid, uvec);
+
+	if (!(BIT_ULL(uvec) & t->thread.ui_recv->uvec_mask))
+		return;
+
+	clear_bit(uvec, (unsigned long *)&t->thread.ui_recv->uvec_mask);
+
+	if (!t->thread.ui_recv->uvec_mask)
+		pr_debug("recv: task=%d unregistered all user vectors\n", t->pid);
+}
+
+/* Callback to clear the vector structures when a vector is unregistered. */
+static void receiver_clear_uvec(struct callback_head *head)
+{
+	struct uintr_receiver_info *r_info;
+	struct uintr_upid_ctx *upid_ctx;
+	struct task_struct *t = current;
+	u64 uvec;
+
+	r_info = container_of(head, struct uintr_receiver_info, twork);
+	uvec = r_info->uvec;
+	upid_ctx = r_info->upid_ctx;
+
+	/*
+	 * If a task has unregistered the interrupt handler the vector
+	 * structures would have already been cleared.
+	 */
+	if (is_uintr_receiver(t)) {
+		/*
+		 * The UPID context in the callback might differ from the one
+		 * on the task if the task unregistered its interrupt handler
+		 * and then registered itself again. The vector structures
+		 * related to the previous UPID would have already been cleared
+		 * in that case.
+		 */
+		if (t->thread.ui_recv->upid_ctx != upid_ctx) {
+			pr_debug("recv: task %d is now using a different UPID\n",
+				 t->pid);
+			goto out_free;
+		}
+
+		/*
+		 * If the vector has been recognized in the UIRR don't modify
+		 * it. We need to disable User Interrupts before modifying the
+		 * UIRR. It might be better to just let that interrupt get
+		 * delivered.
+		 */
+		__clear_vector_from_upid(uvec, upid_ctx->upid);
+		__clear_vector_from_task(uvec);
+	}
+
+out_free:
+	put_upid_ref(upid_ctx);
+	kfree(r_info);
+}
+
 int do_uintr_unregister_handler(void)
 {
 	struct task_struct *t = current;
@@ -239,6 +311,53 @@  int do_uintr_register_handler(u64 handler)
 	return 0;
 }
 
+void do_uintr_unregister_vector(struct uintr_receiver_info *r_info)
+{
+	int ret;
+
+	pr_debug("recv: Adding task work to clear vector %llu added for task=%d\n",
+		 r_info->uvec, r_info->upid_ctx->task->pid);
+
+	init_task_work(&r_info->twork, receiver_clear_uvec);
+	ret = task_work_add(r_info->upid_ctx->task, &r_info->twork, true);
+	if (ret) {
+		pr_debug("recv: Clear vector task=%d has already exited\n",
+			 r_info->upid_ctx->task->pid);
+		put_upid_ref(r_info->upid_ctx);
+		kfree(r_info);
+		return;
+	}
+}
+
+int do_uintr_register_vector(struct uintr_receiver_info *r_info)
+{
+	struct uintr_receiver *ui_recv;
+	struct task_struct *t = current;
+
+	/*
+	 * A vector should only be registered by a task that
+	 * has an interrupt handler registered.
+	 */
+	if (!is_uintr_receiver(t))
+		return -EINVAL;
+
+	if (r_info->uvec >= UINTR_MAX_UVEC_NR)
+		return -ENOSPC;
+
+	ui_recv = t->thread.ui_recv;
+
+	if (ui_recv->uvec_mask & BIT_ULL(r_info->uvec))
+		return -EBUSY;
+
+	ui_recv->uvec_mask |= BIT_ULL(r_info->uvec);
+	pr_debug("recv: task %d new uvec=%llu, new mask %llx\n",
+		 t->pid, r_info->uvec, ui_recv->uvec_mask);
+
+	r_info->upid_ctx = get_upid_ref(ui_recv->upid_ctx);
+
+	return 0;
+}
+
 /* Suppress notifications since this task is being context switched out */
 void switch_uintr_prepare(struct task_struct *prev)
 {
diff --git a/arch/x86/kernel/uintr_fd.c b/arch/x86/kernel/uintr_fd.c
index a1a9c105fdab..f0548bbac776 100644
--- a/arch/x86/kernel/uintr_fd.c
+++ b/arch/x86/kernel/uintr_fd.c
@@ -6,11 +6,105 @@ 
  */
 #define pr_fmt(fmt)	"uintr: " fmt
 
+#include <linux/anon_inodes.h>
+#include <linux/fdtable.h>
 #include <linux/sched.h>
 #include <linux/syscalls.h>
 
 #include <asm/uintr.h>
 
+struct uintrfd_ctx {
+	struct uintr_receiver_info *r_info;
+};
+
+#ifdef CONFIG_PROC_FS
+static void uintrfd_show_fdinfo(struct seq_file *m, struct file *file)
+{
+	struct uintrfd_ctx *uintrfd_ctx = file->private_data;
+
+	/* Check: Should we print the receiver and sender info here? */
+	seq_printf(m, "user_vector:%llu\n", uintrfd_ctx->r_info->uvec);
+}
+#endif
+
+static int uintrfd_release(struct inode *inode, struct file *file)
+{
+	struct uintrfd_ctx *uintrfd_ctx = file->private_data;
+
+	pr_debug("recv: Release uintrfd for r_task %d uvec %llu\n",
+		 uintrfd_ctx->r_info->upid_ctx->task->pid,
+		 uintrfd_ctx->r_info->uvec);
+
+	do_uintr_unregister_vector(uintrfd_ctx->r_info);
+	kfree(uintrfd_ctx);
+
+	return 0;
+}
+
+static const struct file_operations uintrfd_fops = {
+#ifdef CONFIG_PROC_FS
+	.show_fdinfo	= uintrfd_show_fdinfo,
+#endif
+	.release	= uintrfd_release,
+	.llseek		= noop_llseek,
+};
+
+/*
+ * sys_uintr_create_fd - Create a uintr_fd for the registered interrupt vector.
+ */
+SYSCALL_DEFINE2(uintr_create_fd, u64, vector, unsigned int, flags)
+{
+	struct uintrfd_ctx *uintrfd_ctx;
+	int uintrfd;
+	int ret;
+
+	if (!uintr_arch_enabled())
+		return -EOPNOTSUPP;
+
+	if (flags)
+		return -EINVAL;
+
+	uintrfd_ctx = kzalloc(sizeof(*uintrfd_ctx), GFP_KERNEL);
+	if (!uintrfd_ctx)
+		return -ENOMEM;
+
+	uintrfd_ctx->r_info = kzalloc(sizeof(*uintrfd_ctx->r_info), GFP_KERNEL);
+	if (!uintrfd_ctx->r_info) {
+		ret = -ENOMEM;
+		goto out_free_ctx;
+	}
+
+	uintrfd_ctx->r_info->uvec = vector;
+	ret = do_uintr_register_vector(uintrfd_ctx->r_info);
+	if (ret) {
+		kfree(uintrfd_ctx->r_info);
+		goto out_free_ctx;
+	}
+
+	/* TODO: Get user input for flags - UFD_CLOEXEC */
+	/* Check: Do we need O_NONBLOCK? */
+	uintrfd = anon_inode_getfd("[uintrfd]", &uintrfd_fops, uintrfd_ctx,
+				   O_RDONLY | O_CLOEXEC | O_NONBLOCK);
+
+	if (uintrfd < 0) {
+		ret = uintrfd;
+		goto out_free_uvec;
+	}
+
+	pr_debug("recv: Alloc vector success uintrfd %d uvec %llu for task=%d\n",
+		 uintrfd, uintrfd_ctx->r_info->uvec, current->pid);
+
+	return uintrfd;
+
+out_free_uvec:
+	do_uintr_unregister_vector(uintrfd_ctx->r_info);
+out_free_ctx:
+	kfree(uintrfd_ctx);
+	pr_debug("recv: Alloc vector failed for task=%d ret %d\n",
+		 current->pid, ret);
+	return ret;
+}
+
 /*
  * sys_uintr_register_handler - setup user interrupt handler for receiver.
  */