diff mbox series

[RFC,3/3] sched: User Mode Concurency Groups

Message ID 20211214205358.701701555@infradead.org (mailing list archive)
State New
Headers show
Series sched: User Managed Concurrency Groups | expand

Commit Message

Peter Zijlstra Dec. 14, 2021, 8:44 p.m. UTC
User Managed Concurrency Groups is an M:N threading toolkit that allows
constructing user space schedulers designed to efficiently manage
heterogeneous in-process workloads while maintaining high CPU
utilization (95%+).

XXX moar changelog explaining how this is moar awesome than
traditional user-space threading.

The big thing that's missing is the SMP wake-to-remote-idle.

The big assumption this whole thing is build on is that
pin_user_pages() preserves user mappings in so far that
pagefault_disable() will never generate EFAULT (unless the user does
munmap() in which case it can keep the pieces).

shrink_page_list() does page_maybe_dma_pinned() before try_to_unmap()
and as such seems to respect this constraint.

unmap_and_move() however seems willing to unmap otherwise pinned (and
hence unmigratable) pages. This might need fixing.

Originally-by: Peter Oskolkov <posk@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/Kconfig                       |    1 
 arch/x86/entry/syscalls/syscall_64.tbl |    3 
 arch/x86/include/asm/thread_info.h     |    2 
 fs/exec.c                              |    1 
 include/linux/entry-common.h           |    6 
 include/linux/sched.h                  |   86 +++
 include/linux/syscalls.h               |    4 
 include/linux/thread_info.h            |    2 
 include/uapi/asm-generic/unistd.h      |    9 
 include/uapi/linux/umcg.h              |  143 +++++
 init/Kconfig                           |   15 
 kernel/entry/common.c                  |   18 
 kernel/exit.c                          |    5 
 kernel/sched/Makefile                  |    1 
 kernel/sched/core.c                    |    9 
 kernel/sched/umcg.c                    |  868 +++++++++++++++++++++++++++++++++
 kernel/sys_ni.c                        |    5 
 17 files changed, 1171 insertions(+), 7 deletions(-)

Comments

Peter Oskolkov Dec. 21, 2021, 5:19 p.m. UTC | #1
On Tue, Dec 14, 2021 at 09:44:48PM +0100, Peter Zijlstra wrote:

I have looked at the code now in relative detail, and I have some comments
below. They are relatively minor, so I'm not against having
this merged, assuming we can work on remaining issues later.

And, of course, the higher-level question re: waking servers
on worker wakeup remains. The main concern, as I mention below,
is that my version of this ensures that no more than one worker
can be RUNNING per server, while this API essentially lets the
server spawn a new worker on each unblock event, resulting in many
workers running per single server. But we can wave this away as
the userspace problem/error, I guess.

I do need to test this patch(set) on a (semi)real workload to see
if all use cases are covered properly, so what I'll do next is
I'll hook this up to our userspace scheduling code and do extensive
testing (with appropriate tweaks to the kernel code). Due to the sheer
amount of userspace work this will entail, and the holidays, this will
take at least several weeks.

> User Managed Concurrency Groups is an M:N threading toolkit that allows
> constructing user space schedulers designed to efficiently manage
> heterogeneous in-process workloads while maintaining high CPU
> utilization (95%+).
>
> XXX moar changelog explaining how this is moar awesome than
> traditional user-space threading.
>
> The big thing that's missing is the SMP wake-to-remote-idle.

I think the SMP remote-wake can be easily emulated in the userspace:
when a server detects a wakeup due to a worker becoming
RUNNABLE, rather than the current worker becoming BLOCKED,
the server can do the wake-to-remote-idle thing in the userspace.

Yes, this will be slower than what I have in my version of the
patchset, but we can live with it for now, and do something later
if a benchmark shows something really compelling.

>
> The big assumption this whole thing is build on is that
> pin_user_pages() preserves user mappings in so far that
> pagefault_disable() will never generate EFAULT (unless the user does
> munmap() in which case it can keep the pieces).
>
> shrink_page_list() does page_maybe_dma_pinned() before try_to_unmap()
> and as such seems to respect this constraint.
>
> unmap_and_move() however seems willing to unmap otherwise pinned (and
> hence unmigratable) pages. This might need fixing.
>
> Originally-by: Peter Oskolkov <posk@google.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  arch/x86/Kconfig                       |    1
>  arch/x86/entry/syscalls/syscall_64.tbl |    3
>  arch/x86/include/asm/thread_info.h     |    2
>  fs/exec.c                              |    1
>  include/linux/entry-common.h           |    6
>  include/linux/sched.h                  |   86 +++
>  include/linux/syscalls.h               |    4
>  include/linux/thread_info.h            |    2
>  include/uapi/asm-generic/unistd.h      |    9
>  include/uapi/linux/umcg.h              |  143 +++++
>  init/Kconfig                           |   15
>  kernel/entry/common.c                  |   18
>  kernel/exit.c                          |    5
>  kernel/sched/Makefile                  |    1
>  kernel/sched/core.c                    |    9
>  kernel/sched/umcg.c                    |  868 +++++++++++++++++++++++++++++++++
>  kernel/sys_ni.c                        |    5
>  17 files changed, 1171 insertions(+), 7 deletions(-)
>
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -248,6 +248,7 @@ config X86
>	select HAVE_RSEQ
>	select HAVE_SYSCALL_TRACEPOINTS
>	select HAVE_UNSTABLE_SCHED_CLOCK
> +	select HAVE_UMCG			if X86_64
>	select HAVE_USER_RETURN_NOTIFIER
>	select HAVE_GENERIC_VDSO
>	select HOTPLUG_SMT			if SMP
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -371,6 +371,9 @@
>  447	common	memfd_secret		sys_memfd_secret
>  448	common	process_mrelease	sys_process_mrelease
>  449	common	futex_waitv		sys_futex_waitv
> +450	common	umcg_ctl		sys_umcg_ctl
> +451	common	umcg_wait		sys_umcg_wait
> +452	common	umcg_kick		sys_umcg_kick
>
>  #
>  # Due to a historical design error, certain syscalls are numbered differently
> --- a/arch/x86/include/asm/thread_info.h
> +++ b/arch/x86/include/asm/thread_info.h
> @@ -83,6 +83,7 @@ struct thread_info {
>  #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
>  #define TIF_SINGLESTEP		4	/* reenable singlestep on user return*/
>  #define TIF_SSBD		5	/* Speculative store bypass disable */
> +#define TIF_UMCG		6	/* UMCG return to user hook */
>  #define TIF_SPEC_IB		9	/* Indirect branch speculation mitigation */
>  #define TIF_SPEC_L1D_FLUSH	10	/* Flush L1D on mm switches (processes) */
>  #define TIF_USER_RETURN_NOTIFY	11	/* notify kernel of userspace return */
> @@ -107,6 +108,7 @@ struct thread_info {
>  #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
>  #define _TIF_SINGLESTEP		(1 << TIF_SINGLESTEP)
>  #define _TIF_SSBD		(1 << TIF_SSBD)
> +#define _TIF_UMCG		(1 << TIF_UMCG)
>  #define _TIF_SPEC_IB		(1 << TIF_SPEC_IB)
>  #define _TIF_SPEC_L1D_FLUSH	(1 << TIF_SPEC_L1D_FLUSH)
>  #define _TIF_USER_RETURN_NOTIFY	(1 << TIF_USER_RETURN_NOTIFY)
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1838,6 +1838,7 @@ static int bprm_execve(struct linux_binp
>	current->fs->in_exec = 0;
>	current->in_execve = 0;
>	rseq_execve(current);
> +	umcg_execve(current);
>	acct_update_integrals(current);
>	task_numa_free(current, false);
>	return retval;
> --- a/include/linux/entry-common.h
> +++ b/include/linux/entry-common.h
> @@ -22,6 +22,10 @@
>  # define _TIF_UPROBE			(0)
>  #endif
>
> +#ifndef _TIF_UMCG
> +# define _TIF_UMCG			(0)
> +#endif
> +
>  /*
>   * SYSCALL_WORK flags handled in syscall_enter_from_user_mode()
>   */
> @@ -42,11 +46,13 @@
>				 SYSCALL_WORK_SYSCALL_EMU |		\
>				 SYSCALL_WORK_SYSCALL_AUDIT |		\
>				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
> +				 SYSCALL_WORK_SYSCALL_UMCG |		\
>				 ARCH_SYSCALL_WORK_ENTER)
>  #define SYSCALL_WORK_EXIT	(SYSCALL_WORK_SYSCALL_TRACEPOINT |	\
>				 SYSCALL_WORK_SYSCALL_TRACE |		\
>				 SYSCALL_WORK_SYSCALL_AUDIT |		\
>				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
> +				 SYSCALL_WORK_SYSCALL_UMCG |		\
>				 SYSCALL_WORK_SYSCALL_EXIT_TRAP	|	\
>				 ARCH_SYSCALL_WORK_EXIT)
>
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -67,6 +67,7 @@ struct sighand_struct;
>  struct signal_struct;
>  struct task_delay_info;
>  struct task_group;
> +struct umcg_task;
>
>  /*
>   * Task state bitmask. NOTE! These bits are also
> @@ -1294,6 +1295,23 @@ struct task_struct {
>	unsigned long rseq_event_mask;
>  #endif
>
> +#ifdef CONFIG_UMCG
> +	/* setup by sys_umcg_ctrl() */
> +	clockid_t		umcg_clock;
> +	struct umcg_task __user	*umcg_task;
> +
> +	/* setup by umcg_pin_enter() */
> +	struct page		*umcg_worker_page;
> +
> +	struct task_struct	*umcg_server;
> +	struct umcg_task __user *umcg_server_task;
> +	struct page		*umcg_server_page;
> +
> +	struct task_struct	*umcg_next;
> +	struct umcg_task __user	*umcg_next_task;
> +	struct page		*umcg_next_page;
> +#endif
> +
>	struct tlbflush_unmap_batch	tlb_ubc;
>
>	union {
> @@ -1687,6 +1705,13 @@ extern struct pid *cad_pid;
>  #define PF_KTHREAD		0x00200000	/* I am a kernel thread */
>  #define PF_RANDOMIZE		0x00400000	/* Randomize virtual address space */
>  #define PF_SWAPWRITE		0x00800000	/* Allowed to write to swap */
> +
> +#ifdef CONFIG_UMCG
> +#define PF_UMCG_WORKER		0x01000000	/* UMCG worker */
> +#else
> +#define PF_UMCG_WORKER		0x00000000
> +#endif
> +
>  #define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_mask */
>  #define PF_MCE_EARLY		0x08000000      /* Early kill for mce process policy */
>  #define PF_MEMALLOC_PIN		0x10000000	/* Allocation context constrained to zones which allow long term pinning. */
> @@ -2294,6 +2319,67 @@ static inline void rseq_execve(struct ta
>  {
>  }
>
> +#endif
> +
> +#ifdef CONFIG_UMCG
> +
> +extern void umcg_sys_enter(struct pt_regs *regs, long syscall);
> +extern void umcg_sys_exit(struct pt_regs *regs);
> +extern void umcg_notify_resume(struct pt_regs *regs);
> +extern void umcg_worker_exit(void);
> +extern void umcg_clear_child(struct task_struct *tsk);
> +
> +/* Called by bprm_execve() in fs/exec.c. */
> +static inline void umcg_execve(struct task_struct *tsk)
> +{
> +	if (tsk->umcg_task)
> +		umcg_clear_child(tsk);
> +}
> +
> +/* Called by do_exit() in kernel/exit.c. */
> +static inline void umcg_handle_exit(void)
> +{
> +	if (current->flags & PF_UMCG_WORKER)
> +		umcg_worker_exit();
> +}
> +
> +/*
> + * umcg_wq_worker_[sleeping|running] are called in core.c by
> + * sched_submit_work() and sched_update_worker().
> + */
> +extern void umcg_wq_worker_sleeping(struct task_struct *tsk);
> +extern void umcg_wq_worker_running(struct task_struct *tsk);
> +
> +#else  /* CONFIG_UMCG */
> +
> +static inline void umcg_sys_enter(struct pt_regs *regs, long syscall)
> +{
> +}
> +
> +static inline void umcg_sys_exit(struct pt_regs *regs)
> +{
> +}
> +
> +static inline void umcg_notify_resume(struct pt_regs *regs)
> +{
> +}
> +
> +static inline void umcg_clear_child(struct task_struct *tsk)
> +{
> +}
> +static inline void umcg_execve(struct task_struct *tsk)
> +{
> +}
> +static inline void umcg_handle_exit(void)
> +{
> +}
> +static inline void umcg_wq_worker_sleeping(struct task_struct *tsk)
> +{
> +}
> +static inline void umcg_wq_worker_running(struct task_struct *tsk)
> +{
> +}
> +
>  #endif
>
>  #ifdef CONFIG_DEBUG_RSEQ
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -72,6 +72,7 @@ struct open_how;
>  struct mount_attr;
>  struct landlock_ruleset_attr;
>  enum landlock_rule_type;
> +struct umcg_task;
>
>  #include <linux/types.h>
>  #include <linux/aio_abi.h>
> @@ -1057,6 +1058,9 @@ asmlinkage long sys_landlock_add_rule(in
>		const void __user *rule_attr, __u32 flags);
>  asmlinkage long sys_landlock_restrict_self(int ruleset_fd, __u32 flags);
>  asmlinkage long sys_memfd_secret(unsigned int flags);
> +asmlinkage long sys_umcg_ctl(u32 flags, struct umcg_task __user *self, clockid_t which_clock);
> +asmlinkage long sys_umcg_wait(u32 flags, u64 abs_timeout);
> +asmlinkage long sys_umcg_kick(u32 flags, pid_t tid);
>
>  /*
>   * Architecture-specific system calls
> --- a/include/linux/thread_info.h
> +++ b/include/linux/thread_info.h
> @@ -46,6 +46,7 @@ enum syscall_work_bit {
>	SYSCALL_WORK_BIT_SYSCALL_AUDIT,
>	SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH,
>	SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP,
> +	SYSCALL_WORK_BIT_SYSCALL_UMCG,
>  };
>
>  #define SYSCALL_WORK_SECCOMP		BIT(SYSCALL_WORK_BIT_SECCOMP)
> @@ -55,6 +56,7 @@ enum syscall_work_bit {
>  #define SYSCALL_WORK_SYSCALL_AUDIT	BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT)
>  #define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH)
>  #define SYSCALL_WORK_SYSCALL_EXIT_TRAP	BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP)
> +#define SYSCALL_WORK_SYSCALL_UMCG	BIT(SYSCALL_WORK_BIT_SYSCALL_UMCG)
>  #endif
>
>  #include <asm/thread_info.h>
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -883,8 +883,15 @@ __SYSCALL(__NR_process_mrelease, sys_pro
>  #define __NR_futex_waitv 449
>  __SYSCALL(__NR_futex_waitv, sys_futex_waitv)
>
> +#define __NR_umcg_ctl 450
> +__SYSCALL(__NR_umcg_ctl, sys_umcg_ctl)
> +#define __NR_umcg_wait 451
> +__SYSCALL(__NR_umcg_wait, sys_umcg_wait)
> +#define __NR_umcg_kick 452
> +__SYSCALL(__NR_umcg_kick, sys_umcg_kick)
> +
>  #undef __NR_syscalls
> -#define __NR_syscalls 450
> +#define __NR_syscalls 453
>
>  /*
>   * 32 bit systems traditionally used different
> --- /dev/null
> +++ b/include/uapi/linux/umcg.h
> @@ -0,0 +1,143 @@
> +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
> +#ifndef _UAPI_LINUX_UMCG_H
> +#define _UAPI_LINUX_UMCG_H
> +
> +#include <linux/types.h>
> +
> +/*
> + * UMCG: User Managed Concurrency Groups.
> + *
> + * Syscalls (see kernel/sched/umcg.c):
> + *      sys_umcg_ctl()  - register/unregister UMCG tasks;
> + *      sys_umcg_wait() - wait/wake/context-switch.
> + *      sys_umcg_kick() - prod a UMCG task
> + *
> + * struct umcg_task (below): controls the state of UMCG tasks.
> + */
> +
> +/*
> + * UMCG task states, the first 6 bits of struct umcg_task.state_ts.
> + * The states represent the user space point of view.
> + *
> + *   ,--------(TF_PREEMPT + notify_resume)-------. ,------------.
> + *   |                                           v |            |
> + * RUNNING -(schedule)-> BLOCKED -(sys_exit)-> RUNNABLE  (signal + notify_resume)
> + *   ^                                           | ^            |
> + *   `--------------(sys_umcg_wait)--------------' `------------'
> + *
> + */
> +#define UMCG_TASK_NONE			0x0000U
> +#define UMCG_TASK_RUNNING		0x0001U
> +#define UMCG_TASK_RUNNABLE		0x0002U
> +#define UMCG_TASK_BLOCKED		0x0003U
> +
> +#define UMCG_TASK_MASK			0x00ffU
> +
> +/*
> + * UMCG_TF_PREEMPT: userspace indicates the worker should be preempted.
> + *
> + * Must only be set on UMCG_TASK_RUNNING; once set, any subsequent
> + * return-to-user (eg sys_umcg_kick()) will perform the equivalent of
> + * sys_umcg_wait() on it. That is, it will wake next_tid/server_tid, transfer
> + * to RUNNABLE and enqueue on the server's runnable list.
> + */
> +#define UMCG_TF_PREEMPT			0x0100U
> +/*
> + * UMCG_TF_COND_WAIT: indicate the task *will* call sys_umcg_wait()
> + *
> + * Enables server loops like (vs umcg_sys_exit()):
> + *
> + *   for(;;) {
> + *	self->status = UMCG_TASK_RUNNABLE | UMCG_TF_COND_WAIT;
> + *	// smp_mb() implied by xchg()
> + *
> + *	runnable_ptr = xchg(self->runnable_workers_ptr, NULL);
> + *	while (runnable_ptr) {
> + *		next = runnable_ptr->runnable_workers_ptr;
> + *
> + *		umcg_server_add_runnable(self, runnable_ptr);
> + *
> + *		runnable_ptr = next;
> + *	}
> + *
> + *	self->next = umcg_server_pick_next(self);
> + *	sys_umcg_wait(0, 0);
> + *   }
> + *
> + * without a signal or interrupt in between setting umcg_task::state and
> + * sys_umcg_wait() resulting in an infinite wait in umcg_notify_resume().
> + */
> +#define UMCG_TF_COND_WAIT		0x0200U
> +
> +#define UMCG_TF_MASK			0xff00U
> +
> +#define UMCG_TASK_ALIGN			64
> +
> +/**
> + * struct umcg_task - controls the state of UMCG tasks.
> + *
> + * The struct is aligned at 64 bytes to ensure that it fits into
> + * a single cache line.
> + */
> +struct umcg_task {
> +	/**
> +	 * @state_ts: the current state of the UMCG task described by
> +	 *            this struct, with a unique timestamp indicating
> +	 *            when the last state change happened.
> +	 *
> +	 * Readable/writable by both the kernel and the userspace.
> +	 *
> +	 * UMCG task state:
> +	 *   bits  0 -  7: task state;
> +	 *   bits  8 - 15: state flags;
> +	 *   bits 16 - 31: for userspace use;
> +	 */
> +	__u32	state;				/* r/w */
> +
> +	/**
> +	 * @next_tid: the TID of the UMCG task that should be context-switched
> +	 *            into in sys_umcg_wait(). Can be zero, in which case
> +	 *            it'll switch to server_tid.
> +	 *
> +	 * @server_tid: the TID of the UMCG server that hosts this task,
> +	 *		when RUNNABLE this task will get added to it's
> +	 *		runnable_workers_ptr list.
> +	 *
> +	 * Read-only for the kernel, read/write for the userspace.
> +	 */
> +	__u32	next_tid;			/* r   */
> +	__u32	server_tid;			/* r   */
> +
> +	__u32	__hole[1];
> +
> +	/*
> +	 * Timestamps for when last we became BLOCKED, RUNNABLE, in CLOCK_MONOTONIC.
> +	 */
> +	__u64	blocked_ts;			/*   w */
> +	__u64   runnable_ts;			/*   w */
> +
> +	/**
> +	 * @runnable_workers_ptr: a single-linked list of runnable workers.
> +	 *
> +	 * Readable/writable by both the kernel and the userspace: the
> +	 * kernel adds items to the list, userspace removes them.
> +	 */
> +	__u64	runnable_workers_ptr;		/* r/w */
> +
> +	__u64	__zero[3];
> +
> +} __attribute__((packed, aligned(UMCG_TASK_ALIGN)));
> +
> +/**
> + * enum umcg_ctl_flag - flags to pass to sys_umcg_ctl
> + * @UMCG_CTL_REGISTER:   register the current task as a UMCG task
> + * @UMCG_CTL_UNREGISTER: unregister the current task as a UMCG task
> + * @UMCG_CTL_WORKER:     register the current task as a UMCG worker
> + */
> +enum umcg_ctl_flag {
> +	UMCG_CTL_REGISTER	= 0x00001,
> +	UMCG_CTL_UNREGISTER	= 0x00002,
> +	UMCG_CTL_WORKER		= 0x10000,
> +};
> +
> +#endif /* _UAPI_LINUX_UMCG_H */
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1686,6 +1686,21 @@ config MEMBARRIER
>
>	  If unsure, say Y.
>
> +config HAVE_UMCG
> +	bool
> +
> +config UMCG
> +	bool "Enable User Managed Concurrency Groups API"
> +	depends on 64BIT
> +	depends on GENERIC_ENTRY
> +	depends on HAVE_UMCG
> +	default n
> +	help
> +	  Enable User Managed Concurrency Groups API, which form the basis
> +	  for an in-process M:N userspace scheduling framework.
> +	  At the moment this is an experimental/RFC feature that is not
> +	  guaranteed to be backward-compatible.
> +
>  config KALLSYMS
>	bool "Load all symbols for debugging/ksymoops" if EXPERT
>	default y
> --- a/kernel/entry/common.c
> +++ b/kernel/entry/common.c
> @@ -6,6 +6,7 @@
>  #include <linux/livepatch.h>
>  #include <linux/audit.h>
>  #include <linux/tick.h>
> +#include <linux/sched.h>
>
>  #include "common.h"
>
> @@ -76,6 +77,9 @@ static long syscall_trace_enter(struct p
>	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
>		trace_sys_enter(regs, syscall);
>
> +	if (work & SYSCALL_WORK_SYSCALL_UMCG)
> +		umcg_sys_enter(regs, syscall);
> +
>	syscall_enter_audit(regs, syscall);
>
>	return ret ? : syscall;
> @@ -155,8 +159,7 @@ static unsigned long exit_to_user_mode_l
>	 * Before returning to user space ensure that all pending work
>	 * items have been completed.
>	 */
> -	while (ti_work & EXIT_TO_USER_MODE_WORK) {
> -
> +	do {
>		local_irq_enable_exit_to_user(ti_work);
>
>		if (ti_work & _TIF_NEED_RESCHED)
> @@ -168,6 +171,10 @@ static unsigned long exit_to_user_mode_l
>		if (ti_work & _TIF_PATCH_PENDING)
>			klp_update_patch_state(current);
>
> +		/* must be before handle_signal_work(); terminates on sigpending */
> +		if (ti_work & _TIF_UMCG)
> +			umcg_notify_resume(regs);
> +
>		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
>			handle_signal_work(regs, ti_work);
>
> @@ -188,7 +195,7 @@ static unsigned long exit_to_user_mode_l
>		tick_nohz_user_enter_prepare();
>
>		ti_work = read_thread_flags();
> -	}
> +	} while (ti_work & EXIT_TO_USER_MODE_WORK);
>
>	/* Return the latest work state for arch_exit_to_user_mode() */
>	return ti_work;
> @@ -203,7 +210,7 @@ static void exit_to_user_mode_prepare(st
>	/* Flush pending rcuog wakeup before the last need_resched() check */
>	tick_nohz_user_enter_prepare();
>
> -	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
> +	if (unlikely(ti_work & (EXIT_TO_USER_MODE_WORK | _TIF_UMCG)))
>		ti_work = exit_to_user_mode_loop(regs, ti_work);
>
>	arch_exit_to_user_mode_prepare(regs, ti_work);
> @@ -253,6 +260,9 @@ static void syscall_exit_work(struct pt_
>	step = report_single_step(work);
>	if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
>		arch_syscall_exit_tracehook(regs, step);
> +
> +	if (work & SYSCALL_WORK_SYSCALL_UMCG)
> +		umcg_sys_exit(regs);
>  }
>
>  /*
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -749,6 +749,10 @@ void __noreturn do_exit(long code)
>	if (unlikely(!tsk->pid))
>		panic("Attempted to kill the idle task!");
>
> +	/* Turn off UMCG sched hooks. */
> +	if (unlikely(tsk->flags & PF_UMCG_WORKER))
> +		tsk->flags &= ~PF_UMCG_WORKER;
> +
>	/*
>	 * If do_exit is called because this processes oopsed, it's possible
>	 * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
> @@ -786,6 +790,7 @@ void __noreturn do_exit(long code)
>
>	io_uring_files_cancel();
>	exit_signals(tsk);  /* sets PF_EXITING */
> +	umcg_handle_exit();
>
>	/* sync mm's RSS info before statistics gathering */
>	if (tsk->mm)
> --- a/kernel/sched/Makefile
> +++ b/kernel/sched/Makefile
> @@ -41,3 +41,4 @@ obj-$(CONFIG_MEMBARRIER) += membarrier.o
>  obj-$(CONFIG_CPU_ISOLATION) += isolation.o
>  obj-$(CONFIG_PSI) += psi.o
>  obj-$(CONFIG_SCHED_CORE) += core_sched.o
> +obj-$(CONFIG_UMCG) += umcg.o
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4272,6 +4272,7 @@ static void __sched_fork(unsigned long c
>	p->wake_entry.u_flags = CSD_TYPE_TTWU;
>	p->migration_pending = NULL;
>  #endif
> +	umcg_clear_child(p);
>  }
>
>  DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
> @@ -6330,9 +6331,11 @@ static inline void sched_submit_work(str
>	 * If a worker goes to sleep, notify and ask workqueue whether it
>	 * wants to wake up a task to maintain concurrency.
>	 */
> -	if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
> +	if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_UMCG_WORKER)) {
>		if (task_flags & PF_WQ_WORKER)
>			wq_worker_sleeping(tsk);
> +		else if (task_flags & PF_UMCG_WORKER)
> +			umcg_wq_worker_sleeping(tsk);
>		else
>			io_wq_worker_sleeping(tsk);
>	}
> @@ -6350,9 +6353,11 @@ static inline void sched_submit_work(str
>
>  static void sched_update_worker(struct task_struct *tsk)
>  {
> -	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
> +	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_UMCG_WORKER)) {
>		if (tsk->flags & PF_WQ_WORKER)
>			wq_worker_running(tsk);
> +		else if (tsk->flags & PF_UMCG_WORKER)
> +			umcg_wq_worker_running(tsk);
>		else
>			io_wq_worker_running(tsk);
>	}
> --- /dev/null
> +++ b/kernel/sched/umcg.c
> @@ -0,0 +1,868 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +
> +/*
> + * User Managed Concurrency Groups (UMCG).
> + *
> + */
> +
> +#include <linux/syscalls.h>
> +#include <linux/types.h>
> +#include <linux/uaccess.h>
> +#include <linux/umcg.h>
> +
> +#include <asm/syscall.h>
> +
> +#include "sched.h"
> +
> +static struct task_struct *umcg_get_task(u32 tid)
> +{
> +	struct task_struct *tsk = NULL;
> +
> +	if (tid) {
> +		rcu_read_lock();
> +		tsk = find_task_by_vpid(tid);
> +		if (tsk && current->mm == tsk->mm && tsk->umcg_task)

This essentially limits all operations to a single mm/process.
Fine for now, but a fast remote context switch is also a valid use
case. It is not directly related to userspace scheduling, so I'm
just mentioning it here. Maybe server-to-server cross-process
context switches should be allowed for the same user/cgroup? (Again,
this is for later to consider).

> +			get_task_struct(tsk);
> +		else
> +			tsk = NULL;
> +		rcu_read_unlock();
> +	}
> +
> +	return tsk;
> +}
> +
> +/**
> + * umcg_pin_pages: pin pages containing struct umcg_task of
> + *		   this task, its server (possibly this task again)
> + *		   and the next (possibly NULL).
> + */
> +static int umcg_pin_pages(void)
> +{
> +	struct task_struct *server = NULL, *next = NULL, *tsk = current;
> +	struct umcg_task __user *self = READ_ONCE(tsk->umcg_task);
> +	int server_tid, next_tid;
> +	int ret;
> +
> +	/* must not have stale state */
> +	if (WARN_ON_ONCE(tsk->umcg_worker_page ||
> +			 tsk->umcg_server_page ||
> +			 tsk->umcg_next_page   ||
> +			 tsk->umcg_server_task ||
> +			 tsk->umcg_next_task   ||
> +			 tsk->umcg_server      ||
> +			 tsk->umcg_next))
> +		return -EBUSY;
> +
> +	ret = -EFAULT;
> +	if (pin_user_pages_fast((unsigned long)self, 1, 0,
> +				&tsk->umcg_worker_page) != 1)
> +		goto clear_self;
> +
> +	if (get_user(server_tid, &self->server_tid))
> +		goto unpin_self;
> +
> +	ret = -ESRCH;
> +	server = umcg_get_task(server_tid);
> +	if (!server)
> +		goto unpin_self;
> +
> +	ret = -EFAULT;
> +	/* must cache due to possible concurrent change vs access_ok() */
> +	tsk->umcg_server_task = READ_ONCE(server->umcg_task);
> +	if (pin_user_pages_fast((unsigned long)tsk->umcg_server_task, 1, 0,
> +				&tsk->umcg_server_page) != 1)
> +		goto clear_server;
> +
> +	tsk->umcg_server = server;
> +
> +	if (get_user(next_tid, &self->next_tid))
> +		goto unpin_server;
> +
> +	if (!next_tid)
> +		goto done;
> +
> +	ret = -ESRCH;
> +	next = umcg_get_task(next_tid);
> +	if (!next)
> +		goto unpin_server;
> +
> +	ret = -EFAULT;
> +	tsk->umcg_next_task = READ_ONCE(next->umcg_task);
> +	if (pin_user_pages_fast((unsigned long)tsk->umcg_next_task, 1, 0,
> +				&tsk->umcg_next_page) != 1)
> +		goto clear_next;
> +
> +	tsk->umcg_next = next;
> +
> +done:
> +	return 0;
> +
> +clear_next:
> +	tsk->umcg_next_task = NULL;
> +	tsk->umcg_next_page = NULL;
> +
> +unpin_server:
> +	unpin_user_page(tsk->umcg_server_page);
> +
> +clear_server:
> +	tsk->umcg_server_task = NULL;
> +	tsk->umcg_server_page = NULL;
> +
> +unpin_self:
> +	unpin_user_page(tsk->umcg_worker_page);
> +clear_self:
> +	tsk->umcg_worker_page = NULL;
> +
> +	return ret;
> +}
> +
> +static void umcg_unpin_pages(void)
> +{
> +	struct task_struct *tsk = current;
> +
> +	if (tsk->umcg_server) {
> +		unpin_user_page(tsk->umcg_worker_page);
> +		tsk->umcg_worker_page = NULL;
> +
> +		unpin_user_page(tsk->umcg_server_page);
> +		tsk->umcg_server_page = NULL;
> +		tsk->umcg_server_task = NULL;
> +
> +		put_task_struct(tsk->umcg_server);
> +		tsk->umcg_server = NULL;
> +
> +		if (tsk->umcg_next) {
> +			unpin_user_page(tsk->umcg_next_page);
> +			tsk->umcg_next_page = NULL;
> +			tsk->umcg_next_task = NULL;
> +
> +			put_task_struct(tsk->umcg_next);
> +			tsk->umcg_next = NULL;
> +		}
> +	}
> +}
> +
> +static void umcg_clear_task(struct task_struct *tsk)
> +{
> +	/*
> +	 * This is either called for the current task, or for a newly forked
> +	 * task that is not yet running, so we don't need strict atomicity
> +	 * below.
> +	 */
> +	if (tsk->umcg_task) {
> +		WRITE_ONCE(tsk->umcg_task, NULL);
> +		tsk->umcg_worker_page = NULL;
> +
> +		tsk->umcg_server = NULL;
> +		tsk->umcg_server_page = NULL;
> +		tsk->umcg_server_task = NULL;
> +
> +		tsk->umcg_next = NULL;
> +		tsk->umcg_next_page = NULL;
> +		tsk->umcg_next_task = NULL;
> +
> +		tsk->flags &= ~PF_UMCG_WORKER;
> +		clear_task_syscall_work(tsk, SYSCALL_UMCG);
> +		clear_tsk_thread_flag(tsk, TIF_UMCG);
> +	}
> +}
> +
> +/* Called for a forked or execve-ed child. */
> +void umcg_clear_child(struct task_struct *tsk)
> +{
> +	umcg_clear_task(tsk);
> +}
> +
> +/* Called both by normally (unregister) and abnormally exiting workers. */
> +void umcg_worker_exit(void)
> +{
> +	umcg_unpin_pages();
> +	umcg_clear_task(current);
> +}
> +
> +/*
> + * Do a state transition: @from -> @to.
> + *
> + * Will clear UMCG_TF_PREEMPT, UMCG_TF_COND_WAIT.
> + *
> + * When @to == {BLOCKED,RUNNABLE}, update timestamps.
> + *
> + * Returns:
> + *   0: success
> + *   -EAGAIN: when self->state != @from
> + *   -EFAULT
> + */
> +static int umcg_update_state(struct task_struct *tsk,
> +			     struct umcg_task __user *self,
> +			     u32 from, u32 to)
> +{
> +	u32 old, new;
> +	u64 now;
> +
> +	if (to >= UMCG_TASK_RUNNABLE) {
> +		switch (tsk->umcg_clock) {
> +		case CLOCK_REALTIME:      now = ktime_get_real_ns();     break;
> +		case CLOCK_MONOTONIC:     now = ktime_get_ns();          break;
> +		case CLOCK_BOOTTIME:      now = ktime_get_boottime_ns(); break;
> +		case CLOCK_TAI:           now = ktime_get_clocktai_ns(); break;
> +		}
> +	}
> +
> +	if (!user_access_begin(self, sizeof(*self)))
> +		return -EFAULT;
> +
> +	unsafe_get_user(old, &self->state, Efault);
> +	do {
> +		if ((old & UMCG_TASK_MASK) != from)
> +			goto fail;
> +
> +		new = old & ~(UMCG_TASK_MASK |
> +			      UMCG_TF_PREEMPT | UMCG_TF_COND_WAIT);
> +		new |= to & UMCG_TASK_MASK;
> +
> +	} while (!unsafe_try_cmpxchg_user(&self->state, &old, new, Efault));
> +
> +	if (to == UMCG_TASK_BLOCKED)
> +		unsafe_put_user(now, &self->blocked_ts, Efault);
> +	if (to == UMCG_TASK_RUNNABLE)
> +		unsafe_put_user(now, &self->runnable_ts, Efault);
> +
> +	user_access_end();
> +	return 0;
> +
> +fail:
> +	user_access_end();
> +	return -EAGAIN;
> +
> +Efault:
> +	user_access_end();
> +	return -EFAULT;
> +}
> +
> +#define __UMCG_DIE(stmt, reason)	do {				\
> +	stmt;								\
> +	pr_warn_ratelimited("%s: killing task %s/%d because: " reason "\n",\
> +			    __func__, current->comm, current->pid);	\
> +	force_sig(SIGKILL);						\
> +	return;								\
> +} while (0)
> +
> +#define UMCG_DIE(reason)	__UMCG_DIE(,reason)
> +#define UMCG_DIE_PF(reason)	__UMCG_DIE(pagefault_enable(), reason)
> +#define UMCG_DIE_UNPIN(reason)	__UMCG_DIE(umcg_unpin_pages(), reason)
> +
> +/* Called from syscall enter path */
> +void umcg_sys_enter(struct pt_regs *regs, long syscall)
> +{
> +	/* avoid recursion vs our own syscalls */
> +	if (syscall == __NR_umcg_wait ||
> +	    syscall == __NR_umcg_ctl)
> +		return;
> +
> +	/* avoid recursion vs schedule() */
> +	current->flags &= ~PF_UMCG_WORKER;
> +
> +	/*
> +	 * Pin all the state on sys_enter() such that we can rely on it
> +	 * from dodgy contexts. It is either unpinned from pre-schedule()
> +	 * or sys_exit(), whichever comes first, thereby ensuring the pin
> +	 * is temporary.
> +	 */
> +	if (umcg_pin_pages())
> +		UMCG_DIE("pin");
> +
> +	current->flags |= PF_UMCG_WORKER;
> +}
> +
> +static int umcg_wake_task(struct task_struct *tsk, struct umcg_task __user *self)
> +{
> +	int ret = umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNING);
> +	if (ret)
> +		return ret;
> +
> +	try_to_wake_up(tsk, TASK_NORMAL, WF_CURRENT_CPU);
> +	return 0;
> +}
> +
> +static int umcg_wake_next(struct task_struct *tsk)
> +{
> +	int ret = umcg_wake_task(tsk->umcg_next, tsk->umcg_next_task);
> +	if (ret)
> +		return ret;
> +
> +	/*
> +	 * If userspace sets umcg_task::next_tid, it needs to remove
> +	 * that task from the ready-queue to avoid another server
> +	 * selecting it. However, that also means it needs to put it
> +	 * back in case it went unused.
> +	 *
> +	 * By clearing the field on use, userspace can detect this case
> +	 * and DTRT.
> +	 */
> +	if (put_user(0u, &tsk->umcg_task->next_tid))
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
> +static int umcg_wake_server(struct task_struct *tsk)
> +{
> +	int ret = umcg_wake_task(tsk->umcg_server, tsk->umcg_server_task);
> +	switch (ret) {
> +	case 0:
> +	case -EAGAIN:
> +		/*
> +		 * Server could have timed-out or already be running
> +		 * due to a runnable enqueue. See umcg_sys_exit().
> +		 */
> +		break;
> +
> +	default:
> +		return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +/*
> + * Wake ::next_tid or ::server_tid.
> + *
> + * Must be called in umcg_pin_pages() context, relies on
> + * tsk->umcg_{server,next}.
> + *
> + * Returns:
> + *   0: success
> + *   -EAGAIN
> + *   -EFAULT
> + */
> +static int umcg_wake(struct task_struct *tsk)
> +{
> +	if (tsk->umcg_next)
> +		return umcg_wake_next(tsk);
> +
> +	return umcg_wake_server(tsk);
> +}
> +
> +/* pre-schedule() */
> +void umcg_wq_worker_sleeping(struct task_struct *tsk)
> +{
> +	struct umcg_task __user *self = READ_ONCE(tsk->umcg_task);
> +
> +	/* Must not fault, mmap_sem might be held. */
> +	pagefault_disable();
> +
> +	if (WARN_ON_ONCE(!tsk->umcg_server))
> +		UMCG_DIE_PF("no server");

We can get here if a running worker (no pinned pages) gets a pagefault
in the userspace. Is umcg_sys_enter() called for pagefaults? If not,
we should not kill the worker; also the userspace won't be able to
detect this worker blocking on a pagefault...

Why don't you like my approach of pinning pages on exit_to_userspace
and unpinning on going to sleep? Yes, the pins will last longer,
but only for scheduled on CPU tasks, so bounded both by time and number
(of course, if umcg_sys_enter() is called on pagefaults/signals/other
interrupts, pinning in umcg_sys_enter() is better).

On the other hand, doing nothing on pagefaults and similar, and having
to only worry about blocking in syscalls, does make things much simpler
(no unexpected concurrency and such). I think most of the things
you found complicated in my patchset, other than the SMP remote-idle wakeup,
were driven by making sure spurious pagefaults are properly handled.

I can't tell now whether keeping workers RUNNING during pagefaults
vs waking their servers to run pending workers is a net gain or loss
re: performance. I'll have to benchmark this when my large test is ready.

> +
> +	if (umcg_update_state(tsk, self, UMCG_TASK_RUNNING, UMCG_TASK_BLOCKED))
> +		UMCG_DIE_PF("state");
> +
> +	if (umcg_wake(tsk))
> +		UMCG_DIE_PF("wake");

So this works _if_ pagefaults do not go here, as a task switching to
next will do so in sys_umcg_wait and thus elide this code. But if pagefaults
do trigger this code, this is wrong, as the server should be woken
on a pagefault, not next: the worker setting next_tid will then
call sys_umcg_wait(), expecting to context-switch.

So: if pagefaults lead here via umcg_sys_enter(), we should wake the
server here, not next. If pagefaults do not trigger umcg_sys_enter(),
then we should not kill the task above with "no server".

> +
> +	pagefault_enable();
> +
> +	/*
> +	 * We're going to sleep, make sure to unpin the pages, this ensures
> +	 * the pins are temporary. Also see umcg_sys_exit().
> +	 */
> +	umcg_unpin_pages();
> +}
> +
> +/* post-schedule() */
> +void umcg_wq_worker_running(struct task_struct *tsk)
> +{
> +	/* nothing here, see umcg_sys_exit() */
> +}
> +
> +/*
> + * Enqueue @tsk on it's server's runnable list
> + *
> + * Must be called in umcg_pin_pages() context, relies on tsk->umcg_server.
> + *
> + * cmpxchg based single linked list add such that list integrity is never
> + * violated.  Userspace *MUST* remove it from the list before changing ->state.
> + * As such, we must change state to RUNNABLE before enqueue.
> + *
> + * Returns:
> + *   0: success
> + *   -EFAULT
> + */
> +static int umcg_enqueue_runnable(struct task_struct *tsk)
> +{
> +	struct umcg_task __user *server = tsk->umcg_server_task;
> +	struct umcg_task __user *self = tsk->umcg_task;
> +	u64 self_ptr = (unsigned long)self;
> +	u64 first_ptr;
> +
> +	/*
> +	 * umcg_pin_pages() did access_ok() on both pointers, use self here
> +	 * only because __user_access_begin() isn't available in generic code.
> +	 */
> +	if (!user_access_begin(self, sizeof(*self)))
> +		return -EFAULT;
> +
> +	unsafe_get_user(first_ptr, &server->runnable_workers_ptr, Efault);
> +	do {
> +		unsafe_put_user(first_ptr, &self->runnable_workers_ptr, Efault);
> +	} while (!unsafe_try_cmpxchg_user(&server->runnable_workers_ptr, &first_ptr, self_ptr, Efault));
> +
> +	user_access_end();
> +	return 0;
> +
> +Efault:
> +	user_access_end();
> +	return -EFAULT;
> +}
> +
> +/*
> + * umcg_wait: Wait for ->state to become RUNNING
> + *
> + * Returns:
> + * 0		- success
> + * -EINTR	- pending signal
> + * -EINVAL	- ::state is not {RUNNABLE,RUNNING}
> + * -ETIMEDOUT
> + * -EFAULT
> + */
> +int umcg_wait(u64 timo)
> +{
> +	struct task_struct *tsk = current;
> +	struct umcg_task __user *self = tsk->umcg_task;
> +	struct page *page = NULL;
> +	u32 state;
> +	int ret;
> +
> +	for (;;) {
> +		set_current_state(TASK_INTERRUPTIBLE);
> +
> +		ret = -EINTR;
> +		if (signal_pending(current))
> +			break;
> +
> +		/*
> +		 * Faults can block and scribble our wait state.
> +		 */
> +		pagefault_disable();
> +		if (get_user(state, &self->state)) {
> +			pagefault_enable();
> +
> +			ret = -EFAULT;
> +			if (page) {
> +				unpin_user_page(page);
> +				page = NULL;
> +				break;
> +			}
> +
> +			if (pin_user_pages_fast((unsigned long)self, 1, 0, &page) != 1) {

I believe that the task should not be TASK_INTERRUPTIBLE here,
as pin_user_pages_fast may fault, and might_fault complains via __might_sleep.

> +				page = NULL;
> +				break;
> +			}
> +
> +			continue;
> +		}
> +
> +		if (page) {
> +			unpin_user_page(page);
> +			page = NULL;
> +		}
> +		pagefault_enable();
> +
> +		state &= UMCG_TASK_MASK;
> +		if (state != UMCG_TASK_RUNNABLE) {
> +			ret = 0;
> +			if (state == UMCG_TASK_RUNNING)
> +				break;
> +
> +			ret = -EINVAL;
> +			break;
> +		}
> +
> +		if (!schedule_hrtimeout_range_clock(timo ? &timo : NULL,
> +						    tsk->timer_slack_ns,
> +						    HRTIMER_MODE_ABS,
> +						    tsk->umcg_clock)) {
> +			ret = -ETIMEDOUT;
> +			break;
> +		}
> +	}
> +	__set_current_state(TASK_RUNNING);
> +
> +	return ret;
> +}
> +
> +void umcg_sys_exit(struct pt_regs *regs)
> +{
> +	struct task_struct *tsk = current;
> +	struct umcg_task __user *self = READ_ONCE(tsk->umcg_task);
> +	long syscall = syscall_get_nr(tsk, regs);
> +
> +	if (syscall == __NR_umcg_wait)
> +		return;
> +
> +	/*
> +	 * sys_umcg_ctl() will get here without having called umcg_sys_enter()
> +	 * as such it will look like a syscall that blocked.
> +	 */
> +
> +	if (tsk->umcg_server) {
> +		/*
> +		 * Didn't block, we done.
> +		 */
> +		umcg_unpin_pages();
> +		return;
> +	}
> +
> +	/* avoid recursion vs schedule() */
> +	current->flags &= ~PF_UMCG_WORKER;
> +
> +	if (umcg_pin_pages())
> +		UMCG_DIE("pin");
> +
> +	if (umcg_update_state(tsk, self, UMCG_TASK_BLOCKED, UMCG_TASK_RUNNABLE))
> +		UMCG_DIE_UNPIN("state");
> +
> +	if (umcg_enqueue_runnable(tsk))
> +		UMCG_DIE_UNPIN("enqueue");
> +
> +	/* Server might not be RUNNABLE, means it's already running */
> +	if (umcg_wake_server(tsk))
> +		UMCG_DIE_UNPIN("wake-server");

So this here breaks the assumption that servers+workers never run
on more CPUs than the number of servers, which I've gone through
a lot of pain to ensure in my patchset.

I think the assumption is based on the idea that a process
using UMCG will get affined to N CPUs, will have N servers and
a number of workers, and they will all happily cooperate and not
get any extra threads running.

Of course the pretty picture was not completely true, as the unblocked
tasks do consume extra threads in the kernel, though never in the
userspace.

So this patch may result in all servers running due to wakeups
in umcg_sys_exit(), with also their currently designated workers
running as well, so the userspace may see N+N running threads.

For now I think this may be OK, but as I mentioned above, I need to
run a larger test with a real workload to see if anything is missing.

What does worry me is that in this wakeup the server calls sys_umcg_wait()
with another worker in next_tid, so now the server will have two
workers running: the current kernel API seems to allow this to happen.
In my patchset the invariant that no more than one worker running
per server was enforced by the kernel.

> +
> +	umcg_unpin_pages();
> +
> +	switch (umcg_wait(0)) {
> +	case -EFAULT:
> +	case -EINVAL:
> +	case -ETIMEDOUT: /* how!?! */
> +	default:

This "default" coming before "case 0" below feels weird... can we do

	switch (umcg_wait()) {
	case 0:
	case -EINTR:
		/* ... */
		break;
	default:
		UMCG_DIE("wait");
	}

> +		UMCG_DIE("wait");
> +
> +	case 0:
> +	case -EINTR:
> +		/* notify_resume will continue the wait after the signal */
> +		break;
> +	}
> +
> +	current->flags |= PF_UMCG_WORKER;
> +}
> +
> +void umcg_notify_resume(struct pt_regs *regs)
> +{
> +	struct task_struct *tsk = current;
> +	struct umcg_task __user *self = tsk->umcg_task;
> +	bool worker = tsk->flags & PF_UMCG_WORKER;
> +	u32 state;
> +
> +	/* avoid recursion vs schedule() */
> +	if (worker)
> +		current->flags &= ~PF_UMCG_WORKER;
> +
> +	if (get_user(state, &self->state))
> +		UMCG_DIE("get-state");
> +
> +	state &= UMCG_TASK_MASK | UMCG_TF_MASK;
> +	if (state == UMCG_TASK_RUNNING)
> +		goto done;
> +
> +	/*
> +	 * See comment at UMCG_TF_COND_WAIT; TL;DR: user *will* call
> +	 * sys_umcg_wait() and signals/interrupts shouldn't block
> +	 * return-to-user.
> +	 */
> +	if (state == (UMCG_TASK_RUNNABLE | UMCG_TF_COND_WAIT))
> +		goto done;
> +
> +	if (state & UMCG_TF_PREEMPT) {
> +		if (umcg_pin_pages())
> +			UMCG_DIE("pin");
> +
> +		if (umcg_update_state(tsk, self,
> +				      UMCG_TASK_RUNNING,
> +				      UMCG_TASK_RUNNABLE))
> +			UMCG_DIE_UNPIN("state");
> +
> +		if (umcg_enqueue_runnable(tsk))
> +			UMCG_DIE_UNPIN("enqueue");
> +
> +		/*
> +		 * XXX do we want a preemption consuming ::next_tid ?
> +		 * I'm currently leaning towards no.

I don't think so: preemption is a sched-type event, so a server
should handle it; next_tid has nothing to do with it.

> +		 */
> +		if (umcg_wake_server(tsk))
> +			UMCG_DIE_UNPIN("wake-server");
> +
> +		umcg_unpin_pages();
> +	}
> +
> +	switch (umcg_wait(0)) {
> +	case -EFAULT:
> +	case -EINVAL:
> +	case -ETIMEDOUT: /* how!?! */
> +	default:
> +		UMCG_DIE("wait");

Same suggestion re: putting case 0/-EINTR first.

> +
> +	case 0:
> +	case -EINTR:
> +		/* we'll will resume the wait after the signal */
> +		break;
> +	}
> +
> +done:
> +	if (worker)
> +		current->flags |= PF_UMCG_WORKER;
> +}
> +
> +/**
> + * sys_umcg_kick: makes a UMCG task cycle through umcg_notify_resume()
> + *
> + * Returns:
> + * 0		- Ok;
> + * -ESRCH	- not a related UMCG task
> + * -EINVAL	- another error happened (unknown flags, etc..)
> + */
> +SYSCALL_DEFINE2(umcg_kick, u32, flags, pid_t, tid)
> +{
> +	struct task_struct *task = umcg_get_task(tid);
> +	if (!task)
> +		return -ESRCH;
> +
> +	if (flags)
> +		return -EINVAL;
> +
> +#ifdef CONFIG_SMP
> +	smp_send_reschedule(task_cpu(task));
> +#endif
> +
> +	return 0;
> +}
> +
> +/**
> + * sys_umcg_wait: transfer running context
> + *
> + * Block until RUNNING. Userspace must already set RUNNABLE to deal with the
> + * sleep condition races (see TF_COND_WAIT).
> + *
> + * Will wake either ::next_tid or ::server_tid to take our place. If this is a
> + * server then not setting ::next_tid will wake self.
> + *
> + * Returns:
> + * 0		- OK;
> + * -ETIMEDOUT	- the timeout expired;
> + * -ERANGE	- the timeout is out of range (worker);
> + * -EAGAIN	- ::state wasn't RUNNABLE, concurrent wakeup;
> + * -EFAULT	- failed accessing struct umcg_task __user of the current
> + *		  task, the server or next;
> + * -ESRCH	- the task to wake not found or not a UMCG task;
> + * -EINVAL	- another error happened (e.g. the current task is not a
> + *		  UMCG task, etc.)
> + */
> +SYSCALL_DEFINE2(umcg_wait, u32, flags, u64, timo)
> +{
> +	struct task_struct *tsk = current;
> +	struct umcg_task __user *self = READ_ONCE(tsk->umcg_task);
> +	bool worker = tsk->flags & PF_UMCG_WORKER;
> +	int ret;
> +
> +	if (!self || flags)
> +		return -EINVAL;
> +
> +	if (worker) {
> +		tsk->flags &= ~PF_UMCG_WORKER;
> +		if (timo)
> +			return -ERANGE;

Worker sleeps timing out is a valid and a real use case. Similar
to futex timeouts, mutex timeouts, condvar timeouts. I do not believe
there is a fundamental problem here, so I'll add worker timeout
handling in my larger test.

In addition, shouldn't we NOT clear PF_UMCG_WORKER flag if we
return an error?

> +	}
> +
> +	/* see umcg_sys_{enter,exit}() syscall exceptions */
> +	ret = umcg_pin_pages();

I do not think we need to pin pages for servers, only for workers. Yes,
this makes things easier/simpler, so ok for now, but maybe later we will
need to be a bit more fine-grained here.

> +	if (ret)
> +		goto unblock;
> +
> +	/*
> +	 * Clear UMCG_TF_COND_WAIT *and* check state == RUNNABLE.
> +	 */
> +	ret = umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNABLE);
> +	if (ret)
> +		goto unpin;
> +
> +	if (worker) {
> +		ret = umcg_enqueue_runnable(tsk);
> +		if (ret)
> +			goto unpin;
> +	}
> +
> +	if (worker)

Should this "if" be merged with the one above?

> +		ret = umcg_wake(tsk);
> +	else if (tsk->umcg_next)
> +		ret = umcg_wake_next(tsk);
> +
> +	if (ret) {
> +		/*
> +		 * XXX already enqueued ourself on ::server_tid; failing now
> +		 * leaves the lot in an inconsistent state since it'll also
> +		 * unblock self in order to return the error. !?!?
> +		 */

It looks like only EFAULT can be here. I'd ensure that, and then just DIE.

> +		goto unpin;
> +	}
> +
> +	umcg_unpin_pages();
> +
> +	ret = umcg_wait(timo);
> +	switch (ret) {
> +	case 0:		/* all done */
> +	case -EINTR:	/* umcg_notify_resume() will continue the wait */
> +		ret = 0;
> +		break;

Why not let workers have timeouts, and keep -ETIMEDOUT here? Just set
UMCG_TF_PREEMPT, or another flag with similar behavior, and
umcg_notify_resume will properly wake the server?

> +
> +	default:
> +		goto unblock;
> +	}
> +out:
> +	if (worker)
> +		tsk->flags |= PF_UMCG_WORKER;
> +	return ret;
> +
> +unpin:
> +	umcg_unpin_pages();
> +unblock:
> +	/*
> +	 * Workers will still block in umcg_notify_resume() before they can
> +	 * consume their error, servers however need to get the error asap.
> +	 *
> +	 * Still, things might be unrecoverably screwy after this. Not our
> +	 * problem.

I think we should explicitly document the unrecoverable screwiness
of errors here, so that the userspace proactively kills itself
to avoid badness. The only reason that returning an error here is
mildly preferable to just killing the task (we already do that
in other places) is to give the userspace an opportunity to
log an error, with more state/info than we can do here.

> +	 */
> +	if (!worker)
> +		umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNING);
> +	goto out;
> +}
> +
> +/**
> + * sys_umcg_ctl: (un)register the current task as a UMCG task.
> + * @flags:       ORed values from enum umcg_ctl_flag; see below;
> + * @self:        a pointer to struct umcg_task that describes this
> + *               task and governs the behavior of sys_umcg_wait if
> + *               registering; must be NULL if unregistering.

@which_clock is not documented. Why do we need the option in the first
place?

> + *
> + * @flags & UMCG_CTL_REGISTER: register a UMCG task:
> + *
> + *         UMCG workers:
> + *              - @flags & UMCG_CTL_WORKER
> + *              - self->state must be UMCG_TASK_BLOCKED
> + *
> + *         UMCG servers:
> + *              - !(@flags & UMCG_CTL_WORKER)
> + *              - self->state must be UMCG_TASK_RUNNING
> + *
> + *         All tasks:
> + *              - self->server_tid must be a valid server
> + *              - self->next_tid must be zero
> + *
> + *         If the conditions above are met, sys_umcg_ctl() immediately returns
> + *         if the registered task is a server. If the registered task is a
> + *         worker it will be added to it's server's runnable_workers_ptr list
> + *         and the server will be woken.
> + *
> + * @flags == UMCG_CTL_UNREGISTER: unregister a UMCG task. If the current task
> + *           is a UMCG worker, the userspace is responsible for waking its
> + *           server (before or after calling sys_umcg_ctl).
> + *
> + * Return:
> + * 0		- success
> + * -EFAULT	- failed to read @self
> + * -EINVAL	- some other error occurred
> + * -ESRCH	- no such server_tid
> + */
> +SYSCALL_DEFINE3(umcg_ctl, u32, flags, struct umcg_task __user *, self, clockid_t, which_clock)
> +{
> +	struct task_struct *server;
> +	struct umcg_task ut;
> +
> +	if ((unsigned long)self % UMCG_TASK_ALIGN)
> +		return -EINVAL;
> +
> +	if (flags & ~(UMCG_CTL_REGISTER |
> +		      UMCG_CTL_UNREGISTER |
> +		      UMCG_CTL_WORKER))
> +		return -EINVAL;
> +
> +	if (flags == UMCG_CTL_UNREGISTER) {
> +		if (self || !current->umcg_task)
> +			return -EINVAL;
> +
> +		if (current->flags & PF_UMCG_WORKER)
> +			umcg_worker_exit();

The server should be woken here. Imagine: one server, one worker.
The server is sleeping, the worker is running. The worker unregisters,
the server keeps sleeping forever?

I'm OK re: NOT waking the server if the worker thread exits without
unregistering, as this is the userspace breaking the contract/protocol.
But here we do need to notify the server. At the minimum so that the
server can schedule a worker to run in its place.

(Why is this important? Worker count can fluctuate considerably:
on load spikes many new workers may be created, and later in
quiet times they exit to free resources.)

> +		else
> +			umcg_clear_task(current);
> +
> +		return 0;
> +	}
> +
> +	if (!(flags & UMCG_CTL_REGISTER))
> +		return -EINVAL;
> +
> +	flags &= ~UMCG_CTL_REGISTER;
> +
> +	switch (which_clock) {
> +	case CLOCK_REALTIME:
> +	case CLOCK_MONOTONIC:
> +	case CLOCK_BOOTTIME:
> +	case CLOCK_TAI:
> +		current->umcg_clock = which_clock;
> +		break;
> +
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	if (current->umcg_task || !self)
> +		return -EINVAL;
> +
> +	if (copy_from_user(&ut, self, sizeof(ut)))
> +		return -EFAULT;
> +
> +	if (ut.next_tid || ut.__hole[0] || ut.__zero[0] || ut.__zero[1] || ut.__zero[2])
> +		return -EINVAL;
> +
> +	rcu_read_lock();
> +	server = find_task_by_vpid(ut.server_tid);
> +	if (server && server->mm == current->mm) {
> +		if (flags == UMCG_CTL_WORKER) {
> +			if (!server->umcg_task ||
> +			    (server->flags & PF_UMCG_WORKER))
> +				server = NULL;
> +		} else {
> +			if (server != current)
> +				server = NULL;
> +		}
> +	} else {
> +		server = NULL;
> +	}
> +	rcu_read_unlock();
> +
> +	if (!server)
> +		return -ESRCH;
> +
> +	if (flags == UMCG_CTL_WORKER) {
> +		if ((ut.state & (UMCG_TASK_MASK | UMCG_TF_MASK)) != UMCG_TASK_BLOCKED)
> +			return -EINVAL;
> +
> +		WRITE_ONCE(current->umcg_task, self);
> +		current->flags |= PF_UMCG_WORKER;	/* hook schedule() */
> +		set_syscall_work(SYSCALL_UMCG);		/* hook syscall */
> +		set_thread_flag(TIF_UMCG);		/* hook return-to-user */

Too many flags, I'd say. Not a big deal, just a mild preference:
I have only a single flag.

> +
> +		/* umcg_sys_exit() will transition to RUNNABLE and wait */
> +
> +	} else {
> +		if ((ut.state & (UMCG_TASK_MASK | UMCG_TF_MASK)) != UMCG_TASK_RUNNING)
> +			return -EINVAL;
> +
> +		WRITE_ONCE(current->umcg_task, self);
> +		set_thread_flag(TIF_UMCG);		/* hook return-to-user */

Why do we need to hook server's return to user? I did not need it in my
version.

> +
> +		/* umcg_notify_resume() would block if not RUNNING */
> +	}
> +
> +	return 0;
> +}
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -273,6 +273,11 @@ COND_SYSCALL(landlock_create_ruleset);
>  COND_SYSCALL(landlock_add_rule);
>  COND_SYSCALL(landlock_restrict_self);
>
> +/* kernel/sched/umcg.c */
> +COND_SYSCALL(umcg_ctl);
> +COND_SYSCALL(umcg_wait);
> +COND_SYSCALL(umcg_kick);
> +
>  /* arch/example/kernel/sys_example.c */
>
>  /* mm/fadvise.c */
>
>
Peter Zijlstra Dec. 24, 2021, 11:27 a.m. UTC | #2
On Tue, Dec 14, 2021 at 09:44:48PM +0100, Peter Zijlstra wrote:

> The big assumption this whole thing is build on is that
> pin_user_pages() preserves user mappings in so far that
> pagefault_disable() will never generate EFAULT (unless the user does
> munmap() in which case it can keep the pieces).
> 
> shrink_page_list() does page_maybe_dma_pinned() before try_to_unmap()
> and as such seems to respect this constraint.
> 
> unmap_and_move() however seems willing to unmap otherwise pinned (and
> hence unmigratable) pages. This might need fixing.

AFAICT this should mostly do,.. I still need to check if
get_user_pages_fast() is itself sufficient to avoid all races or if we
need to strengthen/augment that too.

---
 mm/migrate.c  | 10 +++++++++-
 mm/mprotect.c |  6 ++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/mm/migrate.c b/mm/migrate.c
index cf25b00f03c8..3850b33c64eb 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1472,7 +1472,15 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
 			nr_subpages = thp_nr_pages(page);
 			cond_resched();
 
-			if (PageHuge(page))
+			/*
+			 * If the page has a pin then expected_page_refs() will
+			 * not match and the whole migration will fail later
+			 * anyway, fail early and preserve the mappings.
+			 */
+			if (page_maybe_dma_pinned(page))
+				rc = -EAGAIN;
+
+			else if (PageHuge(page))
 				rc = unmap_and_move_huge_page(get_new_page,
 						put_new_page, private, page,
 						pass > 2, mode, reason,
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 9a105fce5aeb..093db725d39f 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -105,6 +105,12 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				if (page_is_file_lru(page) && PageDirty(page))
 					continue;
 
+				/*
+				 * Can't migrate pinned pages, avoid touching them.
+				 */
+				if (page_maybe_dma_pinned(page))
+					continue;
+
 				/*
 				 * Don't mess with PTEs if page is already on the node
 				 * a single-threaded process is running on.
Peter Zijlstra Jan. 14, 2022, 2:09 p.m. UTC | #3
Hi!

I've seen you send a new version based on this, but I figured I ought to
reply to this first.

On Tue, Dec 21, 2021 at 05:19:00PM +0000, Peter Oskolkov wrote:

> > +/* pre-schedule() */
> > +void umcg_wq_worker_sleeping(struct task_struct *tsk)
> > +{
> > +	struct umcg_task __user *self = READ_ONCE(tsk->umcg_task);
> > +
> > +	/* Must not fault, mmap_sem might be held. */
> > +	pagefault_disable();
> > +
> > +	if (WARN_ON_ONCE(!tsk->umcg_server))
> > +		UMCG_DIE_PF("no server");
> 
> We can get here if a running worker (no pinned pages) gets a pagefault
> in the userspace. Is umcg_sys_enter() called for pagefaults? If not,
> we should not kill the worker; also the userspace won't be able to
> detect this worker blocking on a pagefault...

Ufff.. good one. No #PF doesn't pass through sys_enter, I'll have to go
fix that.

> Why don't you like my approach of pinning pages on exit_to_userspace
> and unpinning on going to sleep? Yes, the pins will last longer,
> but only for scheduled on CPU tasks, so bounded both by time and number
> (of course, if umcg_sys_enter() is called on pagefaults/signals/other
> interrupts, pinning in umcg_sys_enter() is better).

Well, in general I would not call userspace bounded. There's plenty
userspace that doesn't do syscalls for indeterminate amounts of time.
Now, such userspace might not be the immediate target for UMCG, but we
also should not rule it out.

Having been an mm/ developer in a previous lifetime, I still think
page-pins should be as short as possible. They can get in the way of
other things, like CMA.

> On the other hand, doing nothing on pagefaults and similar, and having
> to only worry about blocking in syscalls, does make things much simpler
> (no unexpected concurrency and such). I think most of the things
> you found complicated in my patchset, other than the SMP remote-idle wakeup,
> were driven by making sure spurious pagefaults are properly handled.
> 
> I can't tell now whether keeping workers RUNNING during pagefaults
> vs waking their servers to run pending workers is a net gain or loss
> re: performance. I'll have to benchmark this when my large test is ready.

I'll go fix the non syscall things that can schedule.

> > +int umcg_wait(u64 timo)
> > +{
> > +	struct task_struct *tsk = current;
> > +	struct umcg_task __user *self = tsk->umcg_task;
> > +	struct page *page = NULL;
> > +	u32 state;
> > +	int ret;
> > +
> > +	for (;;) {
> > +		set_current_state(TASK_INTERRUPTIBLE);
> > +
> > +		ret = -EINTR;
> > +		if (signal_pending(current))
> > +			break;
> > +
> > +		/*
> > +		 * Faults can block and scribble our wait state.
> > +		 */
> > +		pagefault_disable();
> > +		if (get_user(state, &self->state)) {
> > +			pagefault_enable();
> > +
> > +			ret = -EFAULT;
> > +			if (page) {
> > +				unpin_user_page(page);
> > +				page = NULL;
> > +				break;
> > +			}
> > +
> > +			if (pin_user_pages_fast((unsigned long)self, 1, 0, &page) != 1) {
> 
> I believe that the task should not be TASK_INTERRUPTIBLE here,
> as pin_user_pages_fast may fault, and might_fault complains via __might_sleep.

Fair enough; can easily mark the task __set_current_state(TASK_RUNNING)
right near pagefault_enable() or something.

> > +				page = NULL;
> > +				break;
> > +			}
> > +
> > +			continue;
> > +		}

> > +void umcg_sys_exit(struct pt_regs *regs)
> > +{
> > +	struct task_struct *tsk = current;
> > +	struct umcg_task __user *self = READ_ONCE(tsk->umcg_task);
> > +	long syscall = syscall_get_nr(tsk, regs);
> > +
> > +	if (syscall == __NR_umcg_wait)
> > +		return;
> > +
> > +	/*
> > +	 * sys_umcg_ctl() will get here without having called umcg_sys_enter()
> > +	 * as such it will look like a syscall that blocked.
> > +	 */
> > +
> > +	if (tsk->umcg_server) {
> > +		/*
> > +		 * Didn't block, we done.
> > +		 */
> > +		umcg_unpin_pages();
> > +		return;
> > +	}
> > +
> > +	/* avoid recursion vs schedule() */
> > +	current->flags &= ~PF_UMCG_WORKER;
> > +
> > +	if (umcg_pin_pages())
> > +		UMCG_DIE("pin");
> > +
> > +	if (umcg_update_state(tsk, self, UMCG_TASK_BLOCKED, UMCG_TASK_RUNNABLE))
> > +		UMCG_DIE_UNPIN("state");
> > +
> > +	if (umcg_enqueue_runnable(tsk))
> > +		UMCG_DIE_UNPIN("enqueue");
> > +
> > +	/* Server might not be RUNNABLE, means it's already running */
> > +	if (umcg_wake_server(tsk))
> > +		UMCG_DIE_UNPIN("wake-server");
> 
> So this here breaks the assumption that servers+workers never run
> on more CPUs than the number of servers, which I've gone through
> a lot of pain to ensure in my patchset.

Yes, but you also completely wrecked signals afaict. But yes, this
preemption thing also causes that, which is why I proposed that LAZY
crud earlier, but I never got that in a shape I was happy with -- it
quickly becomes a mess :/

> I think the assumption is based on the idea that a process
> using UMCG will get affined to N CPUs, will have N servers and
> a number of workers, and they will all happily cooperate and not
> get any extra threads running.
> 
> Of course the pretty picture was not completely true, as the unblocked
> tasks do consume extra threads in the kernel, though never in the
> userspace.

Right, there is some unmanaged time anyway.

> So this patch may result in all servers running due to wakeups
> in umcg_sys_exit(), with also their currently designated workers
> running as well, so the userspace may see N+N running threads.

I think this was already true, the servers could be running and all
workers could be woken from their in-kernel slumber, entering unmamanged
time, seeing N+M running tasks as worst possible case.

But yes, the 2N case is more common now.

> For now I think this may be OK, but as I mentioned above, I need to
> run a larger test with a real workload to see if anything is missing.
> 
> What does worry me is that in this wakeup the server calls sys_umcg_wait()
> with another worker in next_tid, so now the server will have two
> workers running: the current kernel API seems to allow this to happen.
> In my patchset the invariant that no more than one worker running
> per server was enforced by the kernel.

So one of the things I've started, but didn't finished, is to forward
port the Proxy-Execution patches to a current kernel and play with the
PE+UMCG interaction.

Thinking about that interaction I've ran into that exact problem.

The 'nice' solution is to actually block the worker, but that's also the
slow solution :/

The other solution seems to be to keep kernel state; track the current
worker associated with the server. I haven't (so far) done that due to
my futex trauma.

So yes, the current API can be used to do the wrong thing, but the
kernel doesn't care and you get to keep the pieces in userspace. And I
much prefer user pieces over kernel pieces.

> > +
> > +	umcg_unpin_pages();
> > +
> > +	switch (umcg_wait(0)) {
> > +	case -EFAULT:
> > +	case -EINVAL:
> > +	case -ETIMEDOUT: /* how!?! */
> > +	default:
> 
> This "default" coming before "case 0" below feels weird... can we do
> 
> 	switch (umcg_wait()) {
> 	case 0:
> 	case -EINTR:
> 		/* ... */
> 		break;
> 	default:
> 		UMCG_DIE("wait");
> 	}

Sure.

> > +		/*
> > +		 * XXX do we want a preemption consuming ::next_tid ?
> > +		 * I'm currently leaning towards no.
> 
> I don't think so: preemption is a sched-type event, so a server
> should handle it; next_tid has nothing to do with it.

We agree, I'll update the comment.

> > +SYSCALL_DEFINE2(umcg_wait, u32, flags, u64, timo)
> > +{
> > +	struct task_struct *tsk = current;
> > +	struct umcg_task __user *self = READ_ONCE(tsk->umcg_task);
> > +	bool worker = tsk->flags & PF_UMCG_WORKER;
> > +	int ret;
> > +
> > +	if (!self || flags)
> > +		return -EINVAL;
> > +
> > +	if (worker) {
> > +		tsk->flags &= ~PF_UMCG_WORKER;
> > +		if (timo)
> > +			return -ERANGE;
> 
> Worker sleeps timing out is a valid and a real use case. Similar
> to futex timeouts, mutex timeouts, condvar timeouts. I do not believe
> there is a fundamental problem here, so I'll add worker timeout
> handling in my larger test.

I don't understand worker timeout, also see:

  https://lkml.kernel.org/r/Ya34S2JCQg+81h4t@hirez.programming.kicks-ass.net

> In addition, shouldn't we NOT clear PF_UMCG_WORKER flag if we
> return an error?

Why? Userspace can do umcg_ctl() if they want, no?

> > +	}
> > +
> > +	/* see umcg_sys_{enter,exit}() syscall exceptions */
> > +	ret = umcg_pin_pages();
> 
> I do not think we need to pin pages for servers, only for workers. Yes,
> this makes things easier/simpler, so ok for now, but maybe later we will
> need to be a bit more fine-grained here.

Right.

> > +	if (ret)
> > +		goto unblock;
> > +
> > +	/*
> > +	 * Clear UMCG_TF_COND_WAIT *and* check state == RUNNABLE.
> > +	 */
> > +	ret = umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNABLE);
> > +	if (ret)
> > +		goto unpin;
> > +
> > +	if (worker) {
> > +		ret = umcg_enqueue_runnable(tsk);
> > +		if (ret)
> > +			goto unpin;
> > +	}
> > +
> > +	if (worker)
> 
> Should this "if" be merged with the one above?

Yes, I think I've done that at least once, but clearly it didn't stick.

Ah, here it is:

  https://lkml.kernel.org/r/Ybm+HJzkO%2F0BB4Va@hirez.programming.kicks-ass.net

but since that LAZY thing didn't live that cleanup seems to have gone
out the window too.

> > +		ret = umcg_wake(tsk);
> > +	else if (tsk->umcg_next)
> > +		ret = umcg_wake_next(tsk);
> > +
> > +	if (ret) {
> > +		/*
> > +		 * XXX already enqueued ourself on ::server_tid; failing now
> > +		 * leaves the lot in an inconsistent state since it'll also
> > +		 * unblock self in order to return the error. !?!?
> > +		 */
> 
> It looks like only EFAULT can be here. I'd ensure that, and then just DIE.

Can also be -EAGAIN if the target task isn't in an expected state.

I also wanted to avoid DIE from the syscalls(). DIE really isn't nice,
we shouldn't do it if it can be avoided.

> > +		goto unpin;
> > +	}
> > +
> > +	umcg_unpin_pages();
> > +
> > +	ret = umcg_wait(timo);
> > +	switch (ret) {
> > +	case 0:		/* all done */
> > +	case -EINTR:	/* umcg_notify_resume() will continue the wait */
> > +		ret = 0;
> > +		break;
> 
> Why not let workers have timeouts, and keep -ETIMEDOUT here? Just set
> UMCG_TF_PREEMPT, or another flag with similar behavior, and
> umcg_notify_resume will properly wake the server?

I really don't understand timeouts on workers, see above.

TF_PREEMPT must only be set on RUNNING, but if we're in wait, we're
RUNNABLE.

> > +
> > +	default:
> > +		goto unblock;
> > +	}
> > +out:
> > +	if (worker)
> > +		tsk->flags |= PF_UMCG_WORKER;
> > +	return ret;
> > +
> > +unpin:
> > +	umcg_unpin_pages();
> > +unblock:
> > +	/*
> > +	 * Workers will still block in umcg_notify_resume() before they can
> > +	 * consume their error, servers however need to get the error asap.
> > +	 *
> > +	 * Still, things might be unrecoverably screwy after this. Not our
> > +	 * problem.
> 
> I think we should explicitly document the unrecoverable screwiness
> of errors here, so that the userspace proactively kills itself
> to avoid badness. The only reason that returning an error here is
> mildly preferable to just killing the task (we already do that
> in other places) is to give the userspace an opportunity to
> log an error, with more state/info than we can do here.

Bah, I should've written a better comment, because I can't quite
remember the case I had in mind. Also, again from the LAZY patch, I
think we can actually do better in some of the cases here.

Specifically, currently we'll enqueue on ::runnable_workers_ptr and fail
waking ::next_tid and leave it at that. While I think waking
::server_tid in that case makes sense.

I'll go prod at this.

> > +	 */
> > +	if (!worker)
> > +		umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNING);
> > +	goto out;
> > +}
> > +
> > +/**
> > + * sys_umcg_ctl: (un)register the current task as a UMCG task.
> > + * @flags:       ORed values from enum umcg_ctl_flag; see below;
> > + * @self:        a pointer to struct umcg_task that describes this
> > + *               task and governs the behavior of sys_umcg_wait if
> > + *               registering; must be NULL if unregistering.
> 
> @which_clock is not documented. Why do we need the option in the first
> place?

Well, you had CLOCK_REALTIME, which I think is quite daft, but Thomas
also wanted CLOCK_TAI, so here we are.

I'll add the comment.

> > +SYSCALL_DEFINE3(umcg_ctl, u32, flags, struct umcg_task __user *, self, clockid_t, which_clock)
> > +{
> > +	struct task_struct *server;
> > +	struct umcg_task ut;
> > +
> > +	if ((unsigned long)self % UMCG_TASK_ALIGN)
> > +		return -EINVAL;
> > +
> > +	if (flags & ~(UMCG_CTL_REGISTER |
> > +		      UMCG_CTL_UNREGISTER |
> > +		      UMCG_CTL_WORKER))
> > +		return -EINVAL;
> > +
> > +	if (flags == UMCG_CTL_UNREGISTER) {
> > +		if (self || !current->umcg_task)
> > +			return -EINVAL;
> > +
> > +		if (current->flags & PF_UMCG_WORKER)
> > +			umcg_worker_exit();
> 
> The server should be woken here. Imagine: one server, one worker.
> The server is sleeping, the worker is running. The worker unregisters,
> the server keeps sleeping forever?
> 
> I'm OK re: NOT waking the server if the worker thread exits without
> unregistering, as this is the userspace breaking the contract/protocol.
> But here we do need to notify the server. At the minimum so that the
> server can schedule a worker to run in its place.
> 
> (Why is this important? Worker count can fluctuate considerably:
> on load spikes many new workers may be created, and later in
> quiet times they exit to free resources.)

Fair enough. Will do.

> > +	if (flags == UMCG_CTL_WORKER) {
> > +		if ((ut.state & (UMCG_TASK_MASK | UMCG_TF_MASK)) != UMCG_TASK_BLOCKED)
> > +			return -EINVAL;
> > +
> > +		WRITE_ONCE(current->umcg_task, self);
> > +		current->flags |= PF_UMCG_WORKER;	/* hook schedule() */
> > +		set_syscall_work(SYSCALL_UMCG);		/* hook syscall */
> > +		set_thread_flag(TIF_UMCG);		/* hook return-to-user */
> 
> Too many flags, I'd say. Not a big deal, just a mild preference:
> I have only a single flag.

Yeah, you overloaded TIF_NOTIFY_RESUME, I prefer an explicit flag. And
the syscall things already have their own flag word, so that simply
needs another one.

> > +
> > +		/* umcg_sys_exit() will transition to RUNNABLE and wait */
> > +
> > +	} else {
> > +		if ((ut.state & (UMCG_TASK_MASK | UMCG_TF_MASK)) != UMCG_TASK_RUNNING)
> > +			return -EINVAL;
> > +
> > +		WRITE_ONCE(current->umcg_task, self);
> > +		set_thread_flag(TIF_UMCG);		/* hook return-to-user */
> 
> Why do we need to hook server's return to user? I did not need it in my
> version.

Signals; server could be blocked in sys_umcg_wait() and get a signal,
then we should resume waiting after the signal.

I hate signals as much as the next guy, but we gotta do something with
them.

Anyway, let me go poke at this code again..
Peter Zijlstra Jan. 14, 2022, 3:16 p.m. UTC | #4
On Fri, Jan 14, 2022 at 03:09:55PM +0100, Peter Zijlstra wrote:

> > I think the assumption is based on the idea that a process
> > using UMCG will get affined to N CPUs, will have N servers and
> > a number of workers, and they will all happily cooperate and not
> > get any extra threads running.
> > 
> > Of course the pretty picture was not completely true, as the unblocked
> > tasks do consume extra threads in the kernel, though never in the
> > userspace.
> 
> Right, there is some unmanaged time anyway.

Also, since we force wake to the same CPU, and overlapping runtime is
'short', they should all stick to the same CPU, even if we don't pin.
Peter Zijlstra Jan. 14, 2022, 5:15 p.m. UTC | #5
On Fri, Jan 14, 2022 at 03:09:55PM +0100, Peter Zijlstra wrote:
> 
> Hi!
> 
> I've seen you send a new version based on this, but I figured I ought to
> reply to this first.
> 
> On Tue, Dec 21, 2021 at 05:19:00PM +0000, Peter Oskolkov wrote:
> 
> > > +/* pre-schedule() */
> > > +void umcg_wq_worker_sleeping(struct task_struct *tsk)
> > > +{
> > > +	struct umcg_task __user *self = READ_ONCE(tsk->umcg_task);
> > > +
> > > +	/* Must not fault, mmap_sem might be held. */
> > > +	pagefault_disable();
> > > +
> > > +	if (WARN_ON_ONCE(!tsk->umcg_server))
> > > +		UMCG_DIE_PF("no server");
> > 
> > We can get here if a running worker (no pinned pages) gets a pagefault
> > in the userspace. Is umcg_sys_enter() called for pagefaults? If not,
> > we should not kill the worker; also the userspace won't be able to
> > detect this worker blocking on a pagefault...
> 
> Ufff.. good one. No #PF doesn't pass through sys_enter, I'll have to go
> fix that.

Something like the below I think.. it builds, but I've not yet tested
it.


---
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -73,18 +73,6 @@
 
 DECLARE_BITMAP(system_vectors, NR_VECTORS);
 
-static inline void cond_local_irq_enable(struct pt_regs *regs)
-{
-	if (regs->flags & X86_EFLAGS_IF)
-		local_irq_enable();
-}
-
-static inline void cond_local_irq_disable(struct pt_regs *regs)
-{
-	if (regs->flags & X86_EFLAGS_IF)
-		local_irq_disable();
-}
-
 __always_inline int is_valid_bugaddr(unsigned long addr)
 {
 	if (addr < TASK_SIZE_MAX)
@@ -177,9 +165,9 @@ static void do_error_trap(struct pt_regs
 
 	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
 			NOTIFY_STOP) {
-		cond_local_irq_enable(regs);
+		irqentry_irq_enable(regs);
 		do_trap(trapnr, signr, str, regs, error_code, sicode, addr);
-		cond_local_irq_disable(regs);
+		irqentry_irq_disable(regs);
 	}
 }
 
@@ -300,7 +288,7 @@ DEFINE_IDTENTRY_ERRORCODE(exc_alignment_
 	if (!user_mode(regs))
 		die("Split lock detected\n", regs, error_code);
 
-	local_irq_enable();
+	irqentry_irq_enable(regs);
 
 	if (handle_user_split_lock(regs, error_code))
 		goto out;
@@ -309,7 +297,7 @@ DEFINE_IDTENTRY_ERRORCODE(exc_alignment_
 		error_code, BUS_ADRALN, NULL);
 
 out:
-	local_irq_disable();
+	irqentry_irq_disable(regs);
 }
 
 #ifdef CONFIG_VMAP_STACK
@@ -473,14 +461,14 @@ DEFINE_IDTENTRY(exc_bounds)
 	if (notify_die(DIE_TRAP, "bounds", regs, 0,
 			X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
 		return;
-	cond_local_irq_enable(regs);
+	irqentry_irq_enable(regs);
 
 	if (!user_mode(regs))
 		die("bounds", regs, 0);
 
 	do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, 0, 0, NULL);
 
-	cond_local_irq_disable(regs);
+	irqentry_irq_disable(regs);
 }
 
 enum kernel_gp_hint {
@@ -567,7 +555,7 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_pr
 	unsigned long gp_addr;
 	int ret;
 
-	cond_local_irq_enable(regs);
+	irqentry_irq_enable(regs);
 
 	if (static_cpu_has(X86_FEATURE_UMIP)) {
 		if (user_mode(regs) && fixup_umip_exception(regs))
@@ -638,7 +626,7 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_pr
 	die_addr(desc, regs, error_code, gp_addr);
 
 exit:
-	cond_local_irq_disable(regs);
+	irqentry_irq_disable(regs);
 }
 
 static bool do_int3(struct pt_regs *regs)
@@ -665,9 +653,9 @@ static void do_int3_user(struct pt_regs
 	if (do_int3(regs))
 		return;
 
-	cond_local_irq_enable(regs);
+	irqentry_irq_enable(regs);
 	do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, 0, 0, NULL);
-	cond_local_irq_disable(regs);
+	irqentry_irq_disable(regs);
 }
 
 DEFINE_IDTENTRY_RAW(exc_int3)
@@ -1003,7 +991,7 @@ static __always_inline void exc_debug_us
 		goto out;
 
 	/* It's safe to allow irq's after DR6 has been saved */
-	local_irq_enable();
+	irqentry_irq_enable(regs);
 
 	if (v8086_mode(regs)) {
 		handle_vm86_trap((struct kernel_vm86_regs *)regs, 0, X86_TRAP_DB);
@@ -1020,7 +1008,7 @@ static __always_inline void exc_debug_us
 		send_sigtrap(regs, 0, get_si_code(dr6));
 
 out_irq:
-	local_irq_disable();
+	irqentry_irq_disable(regs);
 out:
 	instrumentation_end();
 	irqentry_exit_to_user_mode(regs);
@@ -1064,7 +1052,7 @@ static void math_error(struct pt_regs *r
 	char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
 						"simd exception";
 
-	cond_local_irq_enable(regs);
+	irqentry_irq_enable(regs);
 
 	if (!user_mode(regs)) {
 		if (fixup_exception(regs, trapnr, 0, 0))
@@ -1099,7 +1087,7 @@ static void math_error(struct pt_regs *r
 	force_sig_fault(SIGFPE, si_code,
 			(void __user *)uprobe_get_trap_addr(regs));
 exit:
-	cond_local_irq_disable(regs);
+	irqentry_irq_disable(regs);
 }
 
 DEFINE_IDTENTRY(exc_coprocessor_error)
@@ -1160,7 +1148,7 @@ static bool handle_xfd_event(struct pt_r
 	if (WARN_ON(!user_mode(regs)))
 		return false;
 
-	local_irq_enable();
+	irqentry_irq_enable(regs);
 
 	err = xfd_enable_feature(xfd_err);
 
@@ -1173,7 +1161,7 @@ static bool handle_xfd_event(struct pt_r
 		break;
 	}
 
-	local_irq_disable();
+	irqentry_irq_disable(regs);
 	return true;
 }
 
@@ -1188,12 +1176,12 @@ DEFINE_IDTENTRY(exc_device_not_available
 	if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) {
 		struct math_emu_info info = { };
 
-		cond_local_irq_enable(regs);
+		irqentry_irq_enable(regs);
 
 		info.regs = regs;
 		math_emulate(&info);
 
-		cond_local_irq_disable(regs);
+		irqentry_irq_disable(regs);
 		return;
 	}
 #endif
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1209,6 +1209,12 @@ do_kern_addr_fault(struct pt_regs *regs,
 NOKPROBE_SYMBOL(do_kern_addr_fault);
 
 /*
+ * EFLAGS[3] is unused and ABI defined to be 0, use it to store IRQ state,
+ * because do_user_addr_fault() is too convoluted to track things.
+ */
+#define X86_EFLAGS_MISC		(1UL << 3)
+
+/*
  * Handle faults in the user portion of the address space.  Nothing in here
  * should check X86_PF_USER without a specific justification: for almost
  * all purposes, we should treat a normal kernel access to user memory
@@ -1290,13 +1296,11 @@ void do_user_addr_fault(struct pt_regs *
 	 * User-mode registers count as a user access even for any
 	 * potential system fault or CPU buglet:
 	 */
-	if (user_mode(regs)) {
-		local_irq_enable();
+	if (user_mode(regs))
 		flags |= FAULT_FLAG_USER;
-	} else {
-		if (regs->flags & X86_EFLAGS_IF)
-			local_irq_enable();
-	}
+
+	irqentry_irq_enable(regs);
+	regs->flags |= X86_EFLAGS_MISC;
 
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 
@@ -1483,14 +1487,10 @@ handle_page_fault(struct pt_regs *regs,
 		do_kern_addr_fault(regs, error_code, address);
 	} else {
 		do_user_addr_fault(regs, error_code, address);
-		/*
-		 * User address page fault handling might have reenabled
-		 * interrupts. Fixing up all potential exit points of
-		 * do_user_addr_fault() and its leaf functions is just not
-		 * doable w/o creating an unholy mess or turning the code
-		 * upside down.
-		 */
-		local_irq_disable();
+		if (regs->flags & X86_EFLAGS_MISC) {
+			regs->flags &= ~X86_EFLAGS_MISC;
+			irqentry_irq_disable(regs);
+		}
 	}
 }
 
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -7,6 +7,7 @@
 #include <linux/syscalls.h>
 #include <linux/seccomp.h>
 #include <linux/sched.h>
+#include <asm/ptrace.h>
 
 #include <asm/entry-common.h>
 
@@ -218,6 +219,24 @@ static inline void local_irq_disable_exi
 }
 #endif
 
+static inline void irqentry_irq_enable(struct pt_regs *regs)
+{
+	if (!regs_irqs_disabled(regs)) {
+		local_irq_enable();
+		if (user_mode(regs) && (current->flags & PF_UMCG_WORKER))
+			umcg_sys_enter(regs, 0);
+	}
+}
+
+static inline void irqentry_irq_disable(struct pt_regs *regs)
+{
+	if (!regs_irqs_disabled(regs)) {
+		if (user_mode(regs) && (current->flags & PF_UMCG_WORKER))
+			umcg_sys_exit(regs);
+		local_irq_disable();
+	}
+}
+
 /**
  * arch_exit_to_user_mode_work - Architecture specific TIF work for exit
  *				 to user mode.
Peter Zijlstra Jan. 17, 2022, 11:35 a.m. UTC | #6
On Fri, Jan 14, 2022 at 03:09:55PM +0100, Peter Zijlstra wrote:
> > > +SYSCALL_DEFINE3(umcg_ctl, u32, flags, struct umcg_task __user *, self, clockid_t, which_clock)
> > > +{
> > > +	struct task_struct *server;
> > > +	struct umcg_task ut;
> > > +
> > > +	if ((unsigned long)self % UMCG_TASK_ALIGN)
> > > +		return -EINVAL;
> > > +
> > > +	if (flags & ~(UMCG_CTL_REGISTER |
> > > +		      UMCG_CTL_UNREGISTER |
> > > +		      UMCG_CTL_WORKER))
> > > +		return -EINVAL;
> > > +
> > > +	if (flags == UMCG_CTL_UNREGISTER) {
> > > +		if (self || !current->umcg_task)
> > > +			return -EINVAL;
> > > +
> > > +		if (current->flags & PF_UMCG_WORKER)
> > > +			umcg_worker_exit();
> > 
> > The server should be woken here. Imagine: one server, one worker.
> > The server is sleeping, the worker is running. The worker unregisters,
> > the server keeps sleeping forever?
> > 
> > I'm OK re: NOT waking the server if the worker thread exits without
> > unregistering, as this is the userspace breaking the contract/protocol.
> > But here we do need to notify the server. At the minimum so that the
> > server can schedule a worker to run in its place.
> > 
> > (Why is this important? Worker count can fluctuate considerably:
> > on load spikes many new workers may be created, and later in
> > quiet times they exit to free resources.)
> 
> Fair enough. Will do.

Something like so then...

---
--- a/kernel/sched/umcg.c
+++ b/kernel/sched/umcg.c
@@ -185,13 +185,6 @@ void umcg_clear_child(struct task_struct
 	umcg_clear_task(tsk);
 }
 
-/* Called both by normally (unregister) and abnormally exiting workers. */
-void umcg_worker_exit(void)
-{
-	umcg_unpin_pages();
-	umcg_clear_task(current);
-}
-
 /*
  * Do a state transition: @from -> @to.
  *
@@ -748,32 +741,43 @@ SYSCALL_DEFINE2(umcg_wait, u32, flags, u
  * sys_umcg_ctl: (un)register the current task as a UMCG task.
  * @flags:       ORed values from enum umcg_ctl_flag; see below;
  * @self:        a pointer to struct umcg_task that describes this
- *               task and governs the behavior of sys_umcg_wait if
- *               registering; must be NULL if unregistering.
+ *               task and governs the behavior of sys_umcg_wait.
  * @which_clock: clockid to use for timestamps and timeouts
  *
  * @flags & UMCG_CTL_REGISTER: register a UMCG task:
  *
- *         UMCG workers:
- *              - @flags & UMCG_CTL_WORKER
- *              - self->state must be UMCG_TASK_BLOCKED
- *
- *         UMCG servers:
- *              - !(@flags & UMCG_CTL_WORKER)
- *              - self->state must be UMCG_TASK_RUNNING
- *
- *         All tasks:
- *              - self->server_tid must be a valid server
- *              - self->next_tid must be zero
- *
- *         If the conditions above are met, sys_umcg_ctl() immediately returns
- *         if the registered task is a server. If the registered task is a
- *         worker it will be added to it's server's runnable_workers_ptr list
- *         and the server will be woken.
- *
- * @flags == UMCG_CTL_UNREGISTER: unregister a UMCG task. If the current task
- *           is a UMCG worker, the userspace is responsible for waking its
- *           server (before or after calling sys_umcg_ctl).
+ *	UMCG workers:
+ *	 - @flags & UMCG_CTL_WORKER
+ *	 - self->state must be UMCG_TASK_BLOCKED
+ *
+ *	UMCG servers:
+ *	 - !(@flags & UMCG_CTL_WORKER)
+ *	 - self->state must be UMCG_TASK_RUNNING
+ *
+ *	All tasks:
+ *	 - self->server_tid must be a valid server
+ *	 - self->next_tid must be zero
+ *
+ *	If the conditions above are met, sys_umcg_ctl() immediately returns
+ *	if the registered task is a server. If the registered task is a
+ *	worker it will be added to it's server's runnable_workers_ptr list
+ *	and the server will be woken.
+ *
+ * @flags & UMCG_CTL_UNREGISTER: unregister a UMCG task.
+ *
+ *	UMCG workers:
+ *	 - @flags & UMCG_CTL_WORKER
+ *
+ *	UMCG servers:
+ *	 - !(@flags & UMCG_CTL_WORKER)
+ *
+ *	All tasks:
+ *	 - self must match with UMCG_CTL_REGISTER
+ *	 - self->state must be UMCG_TASK_RUNNING
+ *	 - self->server_tid must be a valid server
+ *
+ * 	If the conditions above are met, sys_umcg_ctl() will change state to
+ * 	UMCG_TASK_NONE, and for workers, wake either next or server.
  *
  * Return:
  * 0		- success
@@ -794,16 +798,31 @@ SYSCALL_DEFINE3(umcg_ctl, u32, flags, st
 		      UMCG_CTL_WORKER))
 		return -EINVAL;
 
-	if (flags == UMCG_CTL_UNREGISTER) {
-		if (self || !current->umcg_task)
+	if (flags & UMCG_CTL_UNREGISTER) {
+		int ret;
+
+		if (!self || self != current->umcg_task)
 			return -EINVAL;
 
-		if (current->flags & PF_UMCG_WORKER) {
-			umcg_worker_exit();
-			// XXX wake server
-		} else
-			umcg_clear_task(current);
+		current->flags &= ~PF_UMCG_WORKER;
 
+		ret = umcg_pin_pages();
+		if (ret) {
+			current->flags |= PF_UMCG_WORKER;
+			return ret;
+		}
+
+		ret = umcg_update_state(current, self, UMCG_TASK_RUNNING, UMCG_TASK_NONE);
+		if (ret) {
+			current->flags |= PF_UMCG_WORKER;
+			return ret;
+		}
+
+		if (current->flags & PF_UMCG_WORKER)
+			umcg_wake(current);
+
+		umcg_unpin_pages();
+		umcg_clear_task(current);
 		return 0;
 	}
Peter Zijlstra Jan. 17, 2022, 12:12 p.m. UTC | #7
On Fri, Jan 14, 2022 at 03:09:55PM +0100, Peter Zijlstra wrote:
> On Tue, Dec 21, 2021 at 05:19:00PM +0000, Peter Oskolkov wrote:

> > > +	/*
> > > +	 * Workers will still block in umcg_notify_resume() before they can
> > > +	 * consume their error, servers however need to get the error asap.
> > > +	 *
> > > +	 * Still, things might be unrecoverably screwy after this. Not our
> > > +	 * problem.
> > 
> > I think we should explicitly document the unrecoverable screwiness
> > of errors here, so that the userspace proactively kills itself
> > to avoid badness. The only reason that returning an error here is
> > mildly preferable to just killing the task (we already do that
> > in other places) is to give the userspace an opportunity to
> > log an error, with more state/info than we can do here.
> 
> Bah, I should've written a better comment, because I can't quite
> remember the case I had in mind. Also, again from the LAZY patch, I
> think we can actually do better in some of the cases here.
> 
> Specifically, currently we'll enqueue on ::runnable_workers_ptr and fail
> waking ::next_tid and leave it at that. While I think waking
> ::server_tid in that case makes sense.
> 
> I'll go prod at this.

Is anybody actually planning to use ::next_tid for workers?

My current thinking is that much of the problems here stem from that.

Let me ponder more..
Peter Zijlstra Jan. 17, 2022, 12:22 p.m. UTC | #8
On Mon, Jan 17, 2022 at 12:35:29PM +0100, Peter Zijlstra wrote:
> @@ -794,16 +798,31 @@ SYSCALL_DEFINE3(umcg_ctl, u32, flags, st
>  		      UMCG_CTL_WORKER))
>  		return -EINVAL;
>  
> -	if (flags == UMCG_CTL_UNREGISTER) {
> -		if (self || !current->umcg_task)
> +	if (flags & UMCG_CTL_UNREGISTER) {
> +		int ret;
> +
> +		if (!self || self != current->umcg_task)
>  			return -EINVAL;
>  
> -		if (current->flags & PF_UMCG_WORKER) {
> -			umcg_worker_exit();
> -			// XXX wake server
> -		} else
> -			umcg_clear_task(current);
> +		current->flags &= ~PF_UMCG_WORKER;
>  
> +		ret = umcg_pin_pages();
> +		if (ret) {
> +			current->flags |= PF_UMCG_WORKER;
> +			return ret;
> +		}
> +
> +		ret = umcg_update_state(current, self, UMCG_TASK_RUNNING, UMCG_TASK_NONE);
> +		if (ret) {
> +			current->flags |= PF_UMCG_WORKER;
> +			return ret;
> +		}

Sorry, should obv only restore PF_UMCG_WORKER for workers.. /me fixes

> +
> +		if (current->flags & PF_UMCG_WORKER)
> +			umcg_wake(current);
> +
> +		umcg_unpin_pages();
> +		umcg_clear_task(current);
>  		return 0;
>  	}
>
Peter Zijlstra Jan. 17, 2022, 1:04 p.m. UTC | #9
On Tue, Dec 21, 2021 at 05:19:00PM +0000, Peter Oskolkov wrote:
> On Tue, Dec 14, 2021 at 09:44:48PM +0100, Peter Zijlstra wrote:
> > +static struct task_struct *umcg_get_task(u32 tid)
> > +{
> > +	struct task_struct *tsk = NULL;
> > +
> > +	if (tid) {
> > +		rcu_read_lock();
> > +		tsk = find_task_by_vpid(tid);
> > +		if (tsk && current->mm == tsk->mm && tsk->umcg_task)
> 
> This essentially limits all operations to a single mm/process.
> Fine for now, but a fast remote context switch is also a valid use
> case. It is not directly related to userspace scheduling, so I'm
> just mentioning it here. Maybe server-to-server cross-process
> context switches should be allowed for the same user/cgroup? (Again,
> this is for later to consider).

Doing cross-address-space UMCG will be a massive effort, too much
(pretty much everything) in this implementation assumes things are
directly addressable.

> > +			get_task_struct(tsk);
> > +		else
> > +			tsk = NULL;
> > +		rcu_read_unlock();
> > +	}
> > +
> > +	return tsk;
> > +}
Peter Zijlstra Jan. 18, 2022, 10:09 a.m. UTC | #10
On Fri, Jan 14, 2022 at 03:09:55PM +0100, Peter Zijlstra wrote:
> On Tue, Dec 21, 2021 at 05:19:00PM +0000, Peter Oskolkov wrote:

> > What does worry me is that in this wakeup the server calls sys_umcg_wait()
> > with another worker in next_tid, so now the server will have two
> > workers running: the current kernel API seems to allow this to happen.
> > In my patchset the invariant that no more than one worker running
> > per server was enforced by the kernel.
> 
> So one of the things I've started, but didn't finished, is to forward
> port the Proxy-Execution patches to a current kernel and play with the
> PE+UMCG interaction.
> 
> Thinking about that interaction I've ran into that exact problem.
> 
> The 'nice' solution is to actually block the worker, but that's also the
> slow solution :/
> 
> The other solution seems to be to keep kernel state; track the current
> worker associated with the server. I haven't (so far) done that due to
> my futex trauma.
> 
> So yes, the current API can be used to do the wrong thing, but the
> kernel doesn't care and you get to keep the pieces in userspace. And I
> much prefer user pieces over kernel pieces.

So I think I came up with a 3rd option; since the TID range is 30 bits
(per FUTEX_TID_MASK) we have those same two top bits to play with.

So we write into server::next_tid the tid of the worker we want to wake;
and we currently have it cleared such that we can distinguish between
the case where sys_umcg_wait() returned an error before or after waking
it.

However; we can use one of the 2 remaining bits to indicate the worker
is woken, let's say bit 31. This then has server::next_tid always
containing the current tid, even when the server has a 'spurious' wakeup
for other things.

Then all we need to do is modify the state check when the bit is set to
ensure we don't wake the worker again if we race sys_umcg_wait() with a
worker blocking.

A bit like this, I suppose... (incompete patch in that it relies on a
whole pile of local changes that might or might not live).

--- a/include/uapi/linux/umcg.h
+++ b/include/uapi/linux/umcg.h
@@ -94,6 +94,8 @@ struct umcg_task {
 	 */
 	__u32	state;				/* r/w */
 
+#define UMCG_TID_RUNNING	0x80000000U
+#define UMCG_TID_MASK		0x3fffffffU
 	/**
 	 * @next_tid: the TID of the UMCG task that should be context-switched
 	 *            into in sys_umcg_wait(). Can be zero, in which case
--- a/kernel/sched/umcg.c
+++ b/kernel/sched/umcg.c
@@ -20,7 +20,7 @@ static struct task_struct *umcg_get_task
 
 	if (tid) {
 		rcu_read_lock();
-		tsk = find_task_by_vpid(tid);
+		tsk = find_task_by_vpid(tid & UMCG_TID_MASK);
 		if (tsk && current->mm == tsk->mm && tsk->umcg_task)
 			get_task_struct(tsk);
 		else
@@ -289,27 +291,6 @@ static int umcg_wake_task(struct task_st
 	return 0;
 }
 
-static int umcg_wake_next(struct task_struct *tsk)
-{
-	int ret = umcg_wake_task(tsk->umcg_next, tsk->umcg_next_task);
-	if (ret)
-		return ret;
-
-	/*
-	 * If userspace sets umcg_task::next_tid, it needs to remove
-	 * that task from the ready-queue to avoid another server
-	 * selecting it. However, that also means it needs to put it
-	 * back in case it went unused.
-	 *
-	 * By clearing the field on use, userspace can detect this case
-	 * and DTRT.
-	 */
-	if (put_user(0u, &tsk->umcg_task->next_tid))
-		return -EFAULT;
-
-	return 0;
-}
-
 static int umcg_wake_server(struct task_struct *tsk)
 {
 	int ret = umcg_wake_task(tsk->umcg_server, tsk->umcg_server_task);
@@ -637,6 +599,48 @@ SYSCALL_DEFINE2(umcg_kick, u32, flags, p
 	return 0;
 }
 
+static int umcg_wake_next(struct task_struct *tsk, struct umcg_task __user *self)
+{
+	struct umcg_task __user *next_task;
+	struct task_struct *next;
+	u32 next_tid, state;
+	int ret;
+
+	if (get_user(next_tid, &self->next_tid))
+		return -EFAULT;
+
+	next = umcg_get_task(next_tid);
+	if (!next)
+		return -ESRCH;
+
+	next_task = READ_ONCE(next->umcg_task);
+
+	if (next_tid & UMCG_TID_RUNNING) {
+		ret = -EFAULT;
+		if (get_user(state, &next_task->state))
+			goto put_next;
+
+		ret = 0;
+		if ((state & UMCG_TASK_MASK) != UMCG_TASK_RUNNING)
+			ret = -EAGAIN;
+
+	} else {
+		ret = umcg_wake_task(next, next_task);
+		if (ret)
+			goto put_next;
+
+		ret = -EFAULT;
+		if (put_user(next_tid | UMCG_TID_RUNNING, &self->next_tid))
+			goto put_next;
+
+		ret = 0;
+	}
+
+put_next:
+	put_task_struct(next);
+	return ret;
+}
+
 /**
  * sys_umcg_wait: transfer running context
  *


And once we have this, we can add sanity checks that server::next_tid is
what it should be for ever worker moving away from RUNNING state (which
depends on the assumption that all threads are in the same PID
namespace).


Does this make sense?
Peter Oskolkov Jan. 18, 2022, 6:19 p.m. UTC | #11
On Tue, Jan 18, 2022 at 2:09 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Jan 14, 2022 at 03:09:55PM +0100, Peter Zijlstra wrote:
> > On Tue, Dec 21, 2021 at 05:19:00PM +0000, Peter Oskolkov wrote:
>
> > > What does worry me is that in this wakeup the server calls sys_umcg_wait()
> > > with another worker in next_tid, so now the server will have two
> > > workers running: the current kernel API seems to allow this to happen.
> > > In my patchset the invariant that no more than one worker running
> > > per server was enforced by the kernel.
> >
> > So one of the things I've started, but didn't finished, is to forward
> > port the Proxy-Execution patches to a current kernel and play with the
> > PE+UMCG interaction.
> >
> > Thinking about that interaction I've ran into that exact problem.
> >
> > The 'nice' solution is to actually block the worker, but that's also the
> > slow solution :/
> >
> > The other solution seems to be to keep kernel state; track the current
> > worker associated with the server. I haven't (so far) done that due to
> > my futex trauma.
> >
> > So yes, the current API can be used to do the wrong thing, but the
> > kernel doesn't care and you get to keep the pieces in userspace. And I
> > much prefer user pieces over kernel pieces.
>
> So I think I came up with a 3rd option; since the TID range is 30 bits
> (per FUTEX_TID_MASK) we have those same two top bits to play with.
>
> So we write into server::next_tid the tid of the worker we want to wake;
> and we currently have it cleared such that we can distinguish between
> the case where sys_umcg_wait() returned an error before or after waking
> it.
>
> However; we can use one of the 2 remaining bits to indicate the worker
> is woken, let's say bit 31. This then has server::next_tid always
> containing the current tid, even when the server has a 'spurious' wakeup
> for other things.
>
> Then all we need to do is modify the state check when the bit is set to
> ensure we don't wake the worker again if we race sys_umcg_wait() with a
> worker blocking.
>
> A bit like this, I suppose... (incompete patch in that it relies on a
> whole pile of local changes that might or might not live).
>
> --- a/include/uapi/linux/umcg.h
> +++ b/include/uapi/linux/umcg.h
> @@ -94,6 +94,8 @@ struct umcg_task {
>          */
>         __u32   state;                          /* r/w */
>
> +#define UMCG_TID_RUNNING       0x80000000U
> +#define UMCG_TID_MASK          0x3fffffffU
>         /**
>          * @next_tid: the TID of the UMCG task that should be context-switched
>          *            into in sys_umcg_wait(). Can be zero, in which case
> --- a/kernel/sched/umcg.c
> +++ b/kernel/sched/umcg.c
> @@ -20,7 +20,7 @@ static struct task_struct *umcg_get_task
>
>         if (tid) {
>                 rcu_read_lock();
> -               tsk = find_task_by_vpid(tid);
> +               tsk = find_task_by_vpid(tid & UMCG_TID_MASK);
>                 if (tsk && current->mm == tsk->mm && tsk->umcg_task)
>                         get_task_struct(tsk);
>                 else
> @@ -289,27 +291,6 @@ static int umcg_wake_task(struct task_st
>         return 0;
>  }
>
> -static int umcg_wake_next(struct task_struct *tsk)
> -{
> -       int ret = umcg_wake_task(tsk->umcg_next, tsk->umcg_next_task);
> -       if (ret)
> -               return ret;
> -
> -       /*
> -        * If userspace sets umcg_task::next_tid, it needs to remove
> -        * that task from the ready-queue to avoid another server
> -        * selecting it. However, that also means it needs to put it
> -        * back in case it went unused.
> -        *
> -        * By clearing the field on use, userspace can detect this case
> -        * and DTRT.
> -        */
> -       if (put_user(0u, &tsk->umcg_task->next_tid))
> -               return -EFAULT;
> -
> -       return 0;
> -}
> -
>  static int umcg_wake_server(struct task_struct *tsk)
>  {
>         int ret = umcg_wake_task(tsk->umcg_server, tsk->umcg_server_task);
> @@ -637,6 +599,48 @@ SYSCALL_DEFINE2(umcg_kick, u32, flags, p
>         return 0;
>  }
>
> +static int umcg_wake_next(struct task_struct *tsk, struct umcg_task __user *self)
> +{
> +       struct umcg_task __user *next_task;
> +       struct task_struct *next;
> +       u32 next_tid, state;
> +       int ret;
> +
> +       if (get_user(next_tid, &self->next_tid))
> +               return -EFAULT;
> +
> +       next = umcg_get_task(next_tid);
> +       if (!next)
> +               return -ESRCH;
> +
> +       next_task = READ_ONCE(next->umcg_task);
> +
> +       if (next_tid & UMCG_TID_RUNNING) {
> +               ret = -EFAULT;
> +               if (get_user(state, &next_task->state))
> +                       goto put_next;
> +
> +               ret = 0;
> +               if ((state & UMCG_TASK_MASK) != UMCG_TASK_RUNNING)
> +                       ret = -EAGAIN;
> +
> +       } else {
> +               ret = umcg_wake_task(next, next_task);
> +               if (ret)
> +                       goto put_next;
> +
> +               ret = -EFAULT;
> +               if (put_user(next_tid | UMCG_TID_RUNNING, &self->next_tid))
> +                       goto put_next;
> +
> +               ret = 0;
> +       }
> +
> +put_next:
> +       put_task_struct(next);
> +       return ret;
> +}
> +
>  /**
>   * sys_umcg_wait: transfer running context
>   *
>
>
> And once we have this, we can add sanity checks that server::next_tid is
> what it should be for ever worker moving away from RUNNING state (which
> depends on the assumption that all threads are in the same PID
> namespace).
>
>
> Does this make sense?

This is a long reply, touching several active discussion topics:

- cooperative userspace scheduling
- next_tid in workers
- worker timeouts
- signals and the general approach

=========== cooperative userspace scheduling

I think an important question to clear is about having worker timeouts
and worker-to-worker context switches (next_tid in workers). These are
actually fundamental, core features of cooperative user-space
scheduling. Yes, if UMCG is viewed simply as enabling what currently
exists in the kernel to be done in the userspace, then yes, worker
timeouts and worker.next_tid seem unneeded, and a runqueue per server,
with a single RUNNING worker per runqueue, makes the most natural
approach, the one that is taken in this new and improved patchset.

However, our experience of running many production workloads with
inprocess userspace scheduling over the last ~8 years indicate that
cooperative userspace scheduling features, built on top of
worker-to-worker context switches (with timeouts), are equally
important in driving performance gains and adoption.

============= worker-to-worker context switches

One example: absl::Mutex (https://abseil.io/about/design/mutex) has
google-internal extensions that are "fiber aware". More specifically,
consider this situation:

- worker W1 acqured the mutex and is doing its work
- worker W2 calls mutex::lock()
  mutex::lock(), being aware of workers, understands that W2 is going to sleep;
  so instead of just doing so, waking the server, and letting
  the server figure out what to run in place of the sleeping worker,
mutex::lock()
  calls into the userspace scheduler in the context of W2 running, and the
  userspace scheduler then picks W3 to run and does W2->W3 context switch.

The optimization above replaces W2->Server and Server->W3 context switches
with a single W2->W3 context switch, which is a material performance gain.

In addition, when W1 calls mutex::unlock(), the scheduling code determines
that W2 is waiting on the mutex, and thus calls W2::wake() from the context of
running W1 (you asked earlier why do we need "WAKE_ONLY").

There are several other similar "cooperative" worker-to-worker context switches
used in "handcrafted" userspace synchronization primitives, the most obvious
is a producer-consumer queue: the producer (W1) prepares an item or several,
and context-switches to the consumer (W2) for the latter to consume the items.

If the producer has more items to produce, it will call W2::wake() instead of
context-switching into it.

I hope the above discussion explains why worker-to-worker context switches,
as well as workers waking another workers, are useful/needed.

================ worker timeouts

Timeouts now are easy to explain: mutex::lock() and condvar::wait() have
timeouts, so workers waiting on these primitives naturally wait with timeouts;
if sys_umcg_wait() supports worker timeouts, this is it, all is simple; if
it does not, the userspace now has to implement the whole timeout machinery,
in order to wake these sleeping workers when their timeouts expire.

=========== signals and the general approach

My version of the patchset has all of these things working. What it
does not have,
compared to the new approach we are discussing here, is runqueues per server
and proper signal handling (and potential integration with proxy execution).

Runqueues per server, in the LAZY mode, are easy to emulate in my patchset:
nothing prevents the userspace to partition workers among servers, and have
servers that "own" their workers to be pointed at by idle_server_tid_ptr.

The only thing that is missing is proper treating of signals. But my patchset
does ensure a single running worker per server, had pagefaults and preemptions
sorted out, etc. Basically, everything works except signals. This patchet
has issues with pagefaults, worker timeouts, worker-to-worker context
switches (do workers move runqueues when they context switch?), etc.

And my patchset now actually looks smaller and simpler, on the kernel side,
that what this patchset is shaping up to be.

What if I fix signals in my patchset? I think the way you deal with signals
will work in my approach equally well; I'll also use umcg_kick() to preempt
workers instead of sending them a signal.

What do you think? If signals work in my patchset, why this new approach has
that my version does not?

Thanks,
Peter
Peter Zijlstra Jan. 19, 2022, 8:47 a.m. UTC | #12
On Tue, Jan 18, 2022 at 10:19:21AM -0800, Peter Oskolkov wrote:
> ============= worker-to-worker context switches
> 
> One example: absl::Mutex (https://abseil.io/about/design/mutex) has
> google-internal extensions that are "fiber aware". More specifically,
> consider this situation:
> 
> - worker W1 acqured the mutex and is doing its work
> - worker W2 calls mutex::lock()
>   mutex::lock(), being aware of workers, understands that W2 is going to sleep;
>   so instead of just doing so, waking the server, and letting
>   the server figure out what to run in place of the sleeping worker,
> mutex::lock()
>   calls into the userspace scheduler in the context of W2 running, and the
>   userspace scheduler then picks W3 to run and does W2->W3 context switch.
> 
> The optimization above replaces W2->Server and Server->W3 context switches
> with a single W2->W3 context switch, which is a material performance gain.

Yes, I've also already reconsidered. Things like pipelines and other
fixed order scheduling policies will greatly benefit from
worker-to-worker switching.

But I think all of them are explicit. That is, we can limit the
::next_tid usage to sys_umcg_wait() and never look at it for implicit
blocks.

> In addition, when W1 calls mutex::unlock(), the scheduling code determines
> that W2 is waiting on the mutex, and thus calls W2::wake() from the context of
> running W1 (you asked earlier why do we need "WAKE_ONLY").

This I'm not at all convinced on. That sounds like it will violate the
1:1 thing.
Peter Zijlstra Jan. 19, 2022, 8:51 a.m. UTC | #13
On Tue, Jan 18, 2022 at 10:19:21AM -0800, Peter Oskolkov wrote:

> ================ worker timeouts
> 
> Timeouts now are easy to explain: mutex::lock() and condvar::wait() have
> timeouts, so workers waiting on these primitives naturally wait with timeouts;
> if sys_umcg_wait() supports worker timeouts, this is it, all is simple; if
> it does not, the userspace now has to implement the whole timeout machinery,
> in order to wake these sleeping workers when their timeouts expire.

I still have absolutely no idea what you're on about here. Please reply
to the email on that subject:

  https://lkml.kernel.org/r/Ya34S2JCQg+81h4t@hirez.programming.kicks-ass.net

What would you have the timeout actually do? Talk about the state
transitions.
Peter Zijlstra Jan. 19, 2022, 8:59 a.m. UTC | #14
On Tue, Jan 18, 2022 at 10:19:21AM -0800, Peter Oskolkov wrote:

> =========== signals and the general approach
> 
> My version of the patchset has all of these things working. What it
> does not have,
> compared to the new approach we are discussing here, is runqueues per server
> and proper signal handling (and potential integration with proxy execution).
> 
> Runqueues per server, in the LAZY mode, are easy to emulate in my patchset:
> nothing prevents the userspace to partition workers among servers, and have
> servers that "own" their workers to be pointed at by idle_server_tid_ptr.
> 
> The only thing that is missing is proper treating of signals. But my patchset
> does ensure a single running worker per server, had pagefaults and preemptions
> sorted out, etc. Basically, everything works except signals. This patchet
> has issues with pagefaults, 

Already fixed pagefaults per:

  YeGvovSckivQnKX8@hirez.programming.kicks-ass.net

> worker timeouts

I still have no clear answer as to what you actually want there.

> , worker-to-worker context
> switches (do workers move runqueues when they context switch?), etc.

Not in kernel, if they need to be migrated, userspace needs to do that.

> And my patchset now actually looks smaller and simpler, on the kernel side,
> that what this patchset is shaping up to be.
> 
> What if I fix signals in my patchset? I think the way you deal with signals
> will work in my approach equally well; I'll also use umcg_kick() to preempt
> workers instead of sending them a signal.
> 
> What do you think? 

I still absolutely hate how long you do page pinning, it *will* wreck
things like CMA which are somewhat latency critical for silly things
like Android camera apps and who knows what else.

You've also forgotten about this:

  YcWutpu7BDeG+dQ2@hirez.programming.kicks-ass.net

That's not optional given how you're using page-pinning. Also, I think
we need at least one direct access to the page after getting the pin in
order to make it work.

That also very much limits it to Anon pages.
Peter Oskolkov Jan. 19, 2022, 5:33 p.m. UTC | #15
On Wed, Jan 19, 2022 at 12:47 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Jan 18, 2022 at 10:19:21AM -0800, Peter Oskolkov wrote:
> > ============= worker-to-worker context switches
> >
> > One example: absl::Mutex (https://abseil.io/about/design/mutex) has
> > google-internal extensions that are "fiber aware". More specifically,
> > consider this situation:
> >
> > - worker W1 acqured the mutex and is doing its work
> > - worker W2 calls mutex::lock()
> >   mutex::lock(), being aware of workers, understands that W2 is going to sleep;
> >   so instead of just doing so, waking the server, and letting
> >   the server figure out what to run in place of the sleeping worker,
> > mutex::lock()
> >   calls into the userspace scheduler in the context of W2 running, and the
> >   userspace scheduler then picks W3 to run and does W2->W3 context switch.
> >
> > The optimization above replaces W2->Server and Server->W3 context switches
> > with a single W2->W3 context switch, which is a material performance gain.
>
> Yes, I've also already reconsidered. Things like pipelines and other
> fixed order scheduling policies will greatly benefit from
> worker-to-worker switching.
>
> But I think all of them are explicit. That is, we can limit the
> ::next_tid usage to sys_umcg_wait() and never look at it for implicit
> blocks.

Yes, of course - when a worker blocks, its server gets notified.

>
> > In addition, when W1 calls mutex::unlock(), the scheduling code determines
> > that W2 is waiting on the mutex, and thus calls W2::wake() from the context of
> > running W1 (you asked earlier why do we need "WAKE_ONLY").
>
> This I'm not at all convinced on. That sounds like it will violate the
> 1:1 thing.

wake_only is a wakeup event, meaning the worker gets added to the wake
queue, not scheduled on a CPU; we don't have to implement it in the
kernel, though - the userspace may keep its own wake queue for workers
like this. So feel free to ignore this operation.
Peter Oskolkov Jan. 19, 2022, 5:52 p.m. UTC | #16
On Wed, Jan 19, 2022 at 1:00 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Jan 18, 2022 at 10:19:21AM -0800, Peter Oskolkov wrote:
>
> > =========== signals and the general approach
> >
> > My version of the patchset has all of these things working. What it
> > does not have,
> > compared to the new approach we are discussing here, is runqueues per server
> > and proper signal handling (and potential integration with proxy execution).
> >
> > Runqueues per server, in the LAZY mode, are easy to emulate in my patchset:
> > nothing prevents the userspace to partition workers among servers, and have
> > servers that "own" their workers to be pointed at by idle_server_tid_ptr.
> >
> > The only thing that is missing is proper treating of signals. But my patchset
> > does ensure a single running worker per server, had pagefaults and preemptions
> > sorted out, etc. Basically, everything works except signals. This patchet
> > has issues with pagefaults,
>
> Already fixed pagefaults per:
>
>   YeGvovSckivQnKX8@hirez.programming.kicks-ass.net

Could you, please, post an updated RFC when you have a chance? Thanks!

>
> > worker timeouts
>
> I still have no clear answer as to what you actually want there.
>
> > , worker-to-worker context
> > switches (do workers move runqueues when they context switch?), etc.
>
> Not in kernel, if they need to be migrated, userspace needs to do that.
>
> > And my patchset now actually looks smaller and simpler, on the kernel side,
> > that what this patchset is shaping up to be.
> >
> > What if I fix signals in my patchset? I think the way you deal with signals
> > will work in my approach equally well; I'll also use umcg_kick() to preempt
> > workers instead of sending them a signal.
> >
> > What do you think?
>
> I still absolutely hate how long you do page pinning, it *will* wreck
> things like CMA which are somewhat latency critical for silly things
> like Android camera apps and who knows what else.
>
> You've also forgotten about this:
>
>   YcWutpu7BDeG+dQ2@hirez.programming.kicks-ass.net
>
> That's not optional given how you're using page-pinning. Also, I think
> we need at least one direct access to the page after getting the pin in
> order to make it work.
>
> That also very much limits it to Anon pages.

I can use the same mm/page pinning strategy as you do. But then our
patchsets will be quite similar, I guess, with the difference being
server wakeups with RUNNING workers vs "lazy" idle_server_tid_ptr. So
OK, let's continue with your approach. If you could post a new RFC
with the memory/paging fixes in it, I'll then add worker timeouts, as
I outlined in a separate email ~ 30min ago, and continue with my
integration/testing.
Peter Zijlstra Jan. 20, 2022, 10:37 a.m. UTC | #17
On Wed, Jan 19, 2022 at 09:52:30AM -0800, Peter Oskolkov wrote:
> Could you, please, post an updated RFC when you have a chance? Thanks!

I was working on that, I'll try and get it done today.
diff mbox series

Patch

--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -248,6 +248,7 @@  config X86
 	select HAVE_RSEQ
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_UNSTABLE_SCHED_CLOCK
+	select HAVE_UMCG			if X86_64
 	select HAVE_USER_RETURN_NOTIFIER
 	select HAVE_GENERIC_VDSO
 	select HOTPLUG_SMT			if SMP
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -371,6 +371,9 @@ 
 447	common	memfd_secret		sys_memfd_secret
 448	common	process_mrelease	sys_process_mrelease
 449	common	futex_waitv		sys_futex_waitv
+450	common	umcg_ctl		sys_umcg_ctl
+451	common	umcg_wait		sys_umcg_wait
+452	common	umcg_kick		sys_umcg_kick
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,6 +83,7 @@  struct thread_info {
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_SINGLESTEP		4	/* reenable singlestep on user return*/
 #define TIF_SSBD		5	/* Speculative store bypass disable */
+#define TIF_UMCG		6	/* UMCG return to user hook */
 #define TIF_SPEC_IB		9	/* Indirect branch speculation mitigation */
 #define TIF_SPEC_L1D_FLUSH	10	/* Flush L1D on mm switches (processes) */
 #define TIF_USER_RETURN_NOTIFY	11	/* notify kernel of userspace return */
@@ -107,6 +108,7 @@  struct thread_info {
 #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
 #define _TIF_SINGLESTEP		(1 << TIF_SINGLESTEP)
 #define _TIF_SSBD		(1 << TIF_SSBD)
+#define _TIF_UMCG		(1 << TIF_UMCG)
 #define _TIF_SPEC_IB		(1 << TIF_SPEC_IB)
 #define _TIF_SPEC_L1D_FLUSH	(1 << TIF_SPEC_L1D_FLUSH)
 #define _TIF_USER_RETURN_NOTIFY	(1 << TIF_USER_RETURN_NOTIFY)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1838,6 +1838,7 @@  static int bprm_execve(struct linux_binp
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
 	rseq_execve(current);
+	umcg_execve(current);
 	acct_update_integrals(current);
 	task_numa_free(current, false);
 	return retval;
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -22,6 +22,10 @@ 
 # define _TIF_UPROBE			(0)
 #endif
 
+#ifndef _TIF_UMCG
+# define _TIF_UMCG			(0)
+#endif
+
 /*
  * SYSCALL_WORK flags handled in syscall_enter_from_user_mode()
  */
@@ -42,11 +46,13 @@ 
 				 SYSCALL_WORK_SYSCALL_EMU |		\
 				 SYSCALL_WORK_SYSCALL_AUDIT |		\
 				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
+				 SYSCALL_WORK_SYSCALL_UMCG |		\
 				 ARCH_SYSCALL_WORK_ENTER)
 #define SYSCALL_WORK_EXIT	(SYSCALL_WORK_SYSCALL_TRACEPOINT |	\
 				 SYSCALL_WORK_SYSCALL_TRACE |		\
 				 SYSCALL_WORK_SYSCALL_AUDIT |		\
 				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
+				 SYSCALL_WORK_SYSCALL_UMCG |		\
 				 SYSCALL_WORK_SYSCALL_EXIT_TRAP	|	\
 				 ARCH_SYSCALL_WORK_EXIT)
 
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -67,6 +67,7 @@  struct sighand_struct;
 struct signal_struct;
 struct task_delay_info;
 struct task_group;
+struct umcg_task;
 
 /*
  * Task state bitmask. NOTE! These bits are also
@@ -1294,6 +1295,23 @@  struct task_struct {
 	unsigned long rseq_event_mask;
 #endif
 
+#ifdef CONFIG_UMCG
+	/* setup by sys_umcg_ctrl() */
+	clockid_t		umcg_clock;
+	struct umcg_task __user	*umcg_task;
+
+	/* setup by umcg_pin_enter() */
+	struct page		*umcg_worker_page;
+
+	struct task_struct	*umcg_server;
+	struct umcg_task __user *umcg_server_task;
+	struct page		*umcg_server_page;
+
+	struct task_struct	*umcg_next;
+	struct umcg_task __user	*umcg_next_task;
+	struct page		*umcg_next_page;
+#endif
+
 	struct tlbflush_unmap_batch	tlb_ubc;
 
 	union {
@@ -1687,6 +1705,13 @@  extern struct pid *cad_pid;
 #define PF_KTHREAD		0x00200000	/* I am a kernel thread */
 #define PF_RANDOMIZE		0x00400000	/* Randomize virtual address space */
 #define PF_SWAPWRITE		0x00800000	/* Allowed to write to swap */
+
+#ifdef CONFIG_UMCG
+#define PF_UMCG_WORKER		0x01000000	/* UMCG worker */
+#else
+#define PF_UMCG_WORKER		0x00000000
+#endif
+
 #define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_mask */
 #define PF_MCE_EARLY		0x08000000      /* Early kill for mce process policy */
 #define PF_MEMALLOC_PIN		0x10000000	/* Allocation context constrained to zones which allow long term pinning. */
@@ -2294,6 +2319,67 @@  static inline void rseq_execve(struct ta
 {
 }
 
+#endif
+
+#ifdef CONFIG_UMCG
+
+extern void umcg_sys_enter(struct pt_regs *regs, long syscall);
+extern void umcg_sys_exit(struct pt_regs *regs);
+extern void umcg_notify_resume(struct pt_regs *regs);
+extern void umcg_worker_exit(void);
+extern void umcg_clear_child(struct task_struct *tsk);
+
+/* Called by bprm_execve() in fs/exec.c. */
+static inline void umcg_execve(struct task_struct *tsk)
+{
+	if (tsk->umcg_task)
+		umcg_clear_child(tsk);
+}
+
+/* Called by do_exit() in kernel/exit.c. */
+static inline void umcg_handle_exit(void)
+{
+	if (current->flags & PF_UMCG_WORKER)
+		umcg_worker_exit();
+}
+
+/*
+ * umcg_wq_worker_[sleeping|running] are called in core.c by
+ * sched_submit_work() and sched_update_worker().
+ */
+extern void umcg_wq_worker_sleeping(struct task_struct *tsk);
+extern void umcg_wq_worker_running(struct task_struct *tsk);
+
+#else  /* CONFIG_UMCG */
+
+static inline void umcg_sys_enter(struct pt_regs *regs, long syscall)
+{
+}
+
+static inline void umcg_sys_exit(struct pt_regs *regs)
+{
+}
+
+static inline void umcg_notify_resume(struct pt_regs *regs)
+{
+}
+
+static inline void umcg_clear_child(struct task_struct *tsk)
+{
+}
+static inline void umcg_execve(struct task_struct *tsk)
+{
+}
+static inline void umcg_handle_exit(void)
+{
+}
+static inline void umcg_wq_worker_sleeping(struct task_struct *tsk)
+{
+}
+static inline void umcg_wq_worker_running(struct task_struct *tsk)
+{
+}
+
 #endif
 
 #ifdef CONFIG_DEBUG_RSEQ
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -72,6 +72,7 @@  struct open_how;
 struct mount_attr;
 struct landlock_ruleset_attr;
 enum landlock_rule_type;
+struct umcg_task;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -1057,6 +1058,9 @@  asmlinkage long sys_landlock_add_rule(in
 		const void __user *rule_attr, __u32 flags);
 asmlinkage long sys_landlock_restrict_self(int ruleset_fd, __u32 flags);
 asmlinkage long sys_memfd_secret(unsigned int flags);
+asmlinkage long sys_umcg_ctl(u32 flags, struct umcg_task __user *self, clockid_t which_clock);
+asmlinkage long sys_umcg_wait(u32 flags, u64 abs_timeout);
+asmlinkage long sys_umcg_kick(u32 flags, pid_t tid);
 
 /*
  * Architecture-specific system calls
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -46,6 +46,7 @@  enum syscall_work_bit {
 	SYSCALL_WORK_BIT_SYSCALL_AUDIT,
 	SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH,
 	SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP,
+	SYSCALL_WORK_BIT_SYSCALL_UMCG,
 };
 
 #define SYSCALL_WORK_SECCOMP		BIT(SYSCALL_WORK_BIT_SECCOMP)
@@ -55,6 +56,7 @@  enum syscall_work_bit {
 #define SYSCALL_WORK_SYSCALL_AUDIT	BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT)
 #define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH)
 #define SYSCALL_WORK_SYSCALL_EXIT_TRAP	BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP)
+#define SYSCALL_WORK_SYSCALL_UMCG	BIT(SYSCALL_WORK_BIT_SYSCALL_UMCG)
 #endif
 
 #include <asm/thread_info.h>
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -883,8 +883,15 @@  __SYSCALL(__NR_process_mrelease, sys_pro
 #define __NR_futex_waitv 449
 __SYSCALL(__NR_futex_waitv, sys_futex_waitv)
 
+#define __NR_umcg_ctl 450
+__SYSCALL(__NR_umcg_ctl, sys_umcg_ctl)
+#define __NR_umcg_wait 451
+__SYSCALL(__NR_umcg_wait, sys_umcg_wait)
+#define __NR_umcg_kick 452
+__SYSCALL(__NR_umcg_kick, sys_umcg_kick)
+
 #undef __NR_syscalls
-#define __NR_syscalls 450
+#define __NR_syscalls 453
 
 /*
  * 32 bit systems traditionally used different
--- /dev/null
+++ b/include/uapi/linux/umcg.h
@@ -0,0 +1,143 @@ 
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_UMCG_H
+#define _UAPI_LINUX_UMCG_H
+
+#include <linux/types.h>
+
+/*
+ * UMCG: User Managed Concurrency Groups.
+ *
+ * Syscalls (see kernel/sched/umcg.c):
+ *      sys_umcg_ctl()  - register/unregister UMCG tasks;
+ *      sys_umcg_wait() - wait/wake/context-switch.
+ *      sys_umcg_kick() - prod a UMCG task
+ *
+ * struct umcg_task (below): controls the state of UMCG tasks.
+ */
+
+/*
+ * UMCG task states, the first 6 bits of struct umcg_task.state_ts.
+ * The states represent the user space point of view.
+ *
+ *   ,--------(TF_PREEMPT + notify_resume)-------. ,------------.
+ *   |                                           v |            |
+ * RUNNING -(schedule)-> BLOCKED -(sys_exit)-> RUNNABLE  (signal + notify_resume)
+ *   ^                                           | ^            |
+ *   `--------------(sys_umcg_wait)--------------' `------------'
+ *
+ */
+#define UMCG_TASK_NONE			0x0000U
+#define UMCG_TASK_RUNNING		0x0001U
+#define UMCG_TASK_RUNNABLE		0x0002U
+#define UMCG_TASK_BLOCKED		0x0003U
+
+#define UMCG_TASK_MASK			0x00ffU
+
+/*
+ * UMCG_TF_PREEMPT: userspace indicates the worker should be preempted.
+ *
+ * Must only be set on UMCG_TASK_RUNNING; once set, any subsequent
+ * return-to-user (eg sys_umcg_kick()) will perform the equivalent of
+ * sys_umcg_wait() on it. That is, it will wake next_tid/server_tid, transfer
+ * to RUNNABLE and enqueue on the server's runnable list.
+ */
+#define UMCG_TF_PREEMPT			0x0100U
+/*
+ * UMCG_TF_COND_WAIT: indicate the task *will* call sys_umcg_wait()
+ *
+ * Enables server loops like (vs umcg_sys_exit()):
+ *
+ *   for(;;) {
+ *	self->status = UMCG_TASK_RUNNABLE | UMCG_TF_COND_WAIT;
+ *	// smp_mb() implied by xchg()
+ *
+ *	runnable_ptr = xchg(self->runnable_workers_ptr, NULL);
+ *	while (runnable_ptr) {
+ *		next = runnable_ptr->runnable_workers_ptr;
+ *
+ *		umcg_server_add_runnable(self, runnable_ptr);
+ *
+ *		runnable_ptr = next;
+ *	}
+ *
+ *	self->next = umcg_server_pick_next(self);
+ *	sys_umcg_wait(0, 0);
+ *   }
+ *
+ * without a signal or interrupt in between setting umcg_task::state and
+ * sys_umcg_wait() resulting in an infinite wait in umcg_notify_resume().
+ */
+#define UMCG_TF_COND_WAIT		0x0200U
+
+#define UMCG_TF_MASK			0xff00U
+
+#define UMCG_TASK_ALIGN			64
+
+/**
+ * struct umcg_task - controls the state of UMCG tasks.
+ *
+ * The struct is aligned at 64 bytes to ensure that it fits into
+ * a single cache line.
+ */
+struct umcg_task {
+	/**
+	 * @state_ts: the current state of the UMCG task described by
+	 *            this struct, with a unique timestamp indicating
+	 *            when the last state change happened.
+	 *
+	 * Readable/writable by both the kernel and the userspace.
+	 *
+	 * UMCG task state:
+	 *   bits  0 -  7: task state;
+	 *   bits  8 - 15: state flags;
+	 *   bits 16 - 31: for userspace use;
+	 */
+	__u32	state;				/* r/w */
+
+	/**
+	 * @next_tid: the TID of the UMCG task that should be context-switched
+	 *            into in sys_umcg_wait(). Can be zero, in which case
+	 *            it'll switch to server_tid.
+	 *
+	 * @server_tid: the TID of the UMCG server that hosts this task,
+	 *		when RUNNABLE this task will get added to it's
+	 *		runnable_workers_ptr list.
+	 *
+	 * Read-only for the kernel, read/write for the userspace.
+	 */
+	__u32	next_tid;			/* r   */
+	__u32	server_tid;			/* r   */
+
+	__u32	__hole[1];
+
+	/*
+	 * Timestamps for when last we became BLOCKED, RUNNABLE, in CLOCK_MONOTONIC.
+	 */
+	__u64	blocked_ts;			/*   w */
+	__u64   runnable_ts;			/*   w */
+
+	/**
+	 * @runnable_workers_ptr: a single-linked list of runnable workers.
+	 *
+	 * Readable/writable by both the kernel and the userspace: the
+	 * kernel adds items to the list, userspace removes them.
+	 */
+	__u64	runnable_workers_ptr;		/* r/w */
+
+	__u64	__zero[3];
+
+} __attribute__((packed, aligned(UMCG_TASK_ALIGN)));
+
+/**
+ * enum umcg_ctl_flag - flags to pass to sys_umcg_ctl
+ * @UMCG_CTL_REGISTER:   register the current task as a UMCG task
+ * @UMCG_CTL_UNREGISTER: unregister the current task as a UMCG task
+ * @UMCG_CTL_WORKER:     register the current task as a UMCG worker
+ */
+enum umcg_ctl_flag {
+	UMCG_CTL_REGISTER	= 0x00001,
+	UMCG_CTL_UNREGISTER	= 0x00002,
+	UMCG_CTL_WORKER		= 0x10000,
+};
+
+#endif /* _UAPI_LINUX_UMCG_H */
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1686,6 +1686,21 @@  config MEMBARRIER
 
 	  If unsure, say Y.
 
+config HAVE_UMCG
+	bool
+
+config UMCG
+	bool "Enable User Managed Concurrency Groups API"
+	depends on 64BIT
+	depends on GENERIC_ENTRY
+	depends on HAVE_UMCG
+	default n
+	help
+	  Enable User Managed Concurrency Groups API, which form the basis
+	  for an in-process M:N userspace scheduling framework.
+	  At the moment this is an experimental/RFC feature that is not
+	  guaranteed to be backward-compatible.
+
 config KALLSYMS
 	bool "Load all symbols for debugging/ksymoops" if EXPERT
 	default y
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -6,6 +6,7 @@ 
 #include <linux/livepatch.h>
 #include <linux/audit.h>
 #include <linux/tick.h>
+#include <linux/sched.h>
 
 #include "common.h"
 
@@ -76,6 +77,9 @@  static long syscall_trace_enter(struct p
 	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
 		trace_sys_enter(regs, syscall);
 
+	if (work & SYSCALL_WORK_SYSCALL_UMCG)
+		umcg_sys_enter(regs, syscall);
+
 	syscall_enter_audit(regs, syscall);
 
 	return ret ? : syscall;
@@ -155,8 +159,7 @@  static unsigned long exit_to_user_mode_l
 	 * Before returning to user space ensure that all pending work
 	 * items have been completed.
 	 */
-	while (ti_work & EXIT_TO_USER_MODE_WORK) {
-
+	do {
 		local_irq_enable_exit_to_user(ti_work);
 
 		if (ti_work & _TIF_NEED_RESCHED)
@@ -168,6 +171,10 @@  static unsigned long exit_to_user_mode_l
 		if (ti_work & _TIF_PATCH_PENDING)
 			klp_update_patch_state(current);
 
+		/* must be before handle_signal_work(); terminates on sigpending */
+		if (ti_work & _TIF_UMCG)
+			umcg_notify_resume(regs);
+
 		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
 			handle_signal_work(regs, ti_work);
 
@@ -188,7 +195,7 @@  static unsigned long exit_to_user_mode_l
 		tick_nohz_user_enter_prepare();
 
 		ti_work = read_thread_flags();
-	}
+	} while (ti_work & EXIT_TO_USER_MODE_WORK);
 
 	/* Return the latest work state for arch_exit_to_user_mode() */
 	return ti_work;
@@ -203,7 +210,7 @@  static void exit_to_user_mode_prepare(st
 	/* Flush pending rcuog wakeup before the last need_resched() check */
 	tick_nohz_user_enter_prepare();
 
-	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
+	if (unlikely(ti_work & (EXIT_TO_USER_MODE_WORK | _TIF_UMCG)))
 		ti_work = exit_to_user_mode_loop(regs, ti_work);
 
 	arch_exit_to_user_mode_prepare(regs, ti_work);
@@ -253,6 +260,9 @@  static void syscall_exit_work(struct pt_
 	step = report_single_step(work);
 	if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
 		arch_syscall_exit_tracehook(regs, step);
+
+	if (work & SYSCALL_WORK_SYSCALL_UMCG)
+		umcg_sys_exit(regs);
 }
 
 /*
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -749,6 +749,10 @@  void __noreturn do_exit(long code)
 	if (unlikely(!tsk->pid))
 		panic("Attempted to kill the idle task!");
 
+	/* Turn off UMCG sched hooks. */
+	if (unlikely(tsk->flags & PF_UMCG_WORKER))
+		tsk->flags &= ~PF_UMCG_WORKER;
+
 	/*
 	 * If do_exit is called because this processes oopsed, it's possible
 	 * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
@@ -786,6 +790,7 @@  void __noreturn do_exit(long code)
 
 	io_uring_files_cancel();
 	exit_signals(tsk);  /* sets PF_EXITING */
+	umcg_handle_exit();
 
 	/* sync mm's RSS info before statistics gathering */
 	if (tsk->mm)
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -41,3 +41,4 @@  obj-$(CONFIG_MEMBARRIER) += membarrier.o
 obj-$(CONFIG_CPU_ISOLATION) += isolation.o
 obj-$(CONFIG_PSI) += psi.o
 obj-$(CONFIG_SCHED_CORE) += core_sched.o
+obj-$(CONFIG_UMCG) += umcg.o
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4272,6 +4272,7 @@  static void __sched_fork(unsigned long c
 	p->wake_entry.u_flags = CSD_TYPE_TTWU;
 	p->migration_pending = NULL;
 #endif
+	umcg_clear_child(p);
 }
 
 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -6330,9 +6331,11 @@  static inline void sched_submit_work(str
 	 * If a worker goes to sleep, notify and ask workqueue whether it
 	 * wants to wake up a task to maintain concurrency.
 	 */
-	if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
+	if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_UMCG_WORKER)) {
 		if (task_flags & PF_WQ_WORKER)
 			wq_worker_sleeping(tsk);
+		else if (task_flags & PF_UMCG_WORKER)
+			umcg_wq_worker_sleeping(tsk);
 		else
 			io_wq_worker_sleeping(tsk);
 	}
@@ -6350,9 +6353,11 @@  static inline void sched_submit_work(str
 
 static void sched_update_worker(struct task_struct *tsk)
 {
-	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
+	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_UMCG_WORKER)) {
 		if (tsk->flags & PF_WQ_WORKER)
 			wq_worker_running(tsk);
+		else if (tsk->flags & PF_UMCG_WORKER)
+			umcg_wq_worker_running(tsk);
 		else
 			io_wq_worker_running(tsk);
 	}
--- /dev/null
+++ b/kernel/sched/umcg.c
@@ -0,0 +1,868 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * User Managed Concurrency Groups (UMCG).
+ *
+ */
+
+#include <linux/syscalls.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/umcg.h>
+
+#include <asm/syscall.h>
+
+#include "sched.h"
+
+static struct task_struct *umcg_get_task(u32 tid)
+{
+	struct task_struct *tsk = NULL;
+
+	if (tid) {
+		rcu_read_lock();
+		tsk = find_task_by_vpid(tid);
+		if (tsk && current->mm == tsk->mm && tsk->umcg_task)
+			get_task_struct(tsk);
+		else
+			tsk = NULL;
+		rcu_read_unlock();
+	}
+
+	return tsk;
+}
+
+/**
+ * umcg_pin_pages: pin pages containing struct umcg_task of
+ *		   this task, its server (possibly this task again)
+ *		   and the next (possibly NULL).
+ */
+static int umcg_pin_pages(void)
+{
+	struct task_struct *server = NULL, *next = NULL, *tsk = current;
+	struct umcg_task __user *self = READ_ONCE(tsk->umcg_task);
+	int server_tid, next_tid;
+	int ret;
+
+	/* must not have stale state */
+	if (WARN_ON_ONCE(tsk->umcg_worker_page ||
+			 tsk->umcg_server_page ||
+			 tsk->umcg_next_page   ||
+			 tsk->umcg_server_task ||
+			 tsk->umcg_next_task   ||
+			 tsk->umcg_server      ||
+			 tsk->umcg_next))
+		return -EBUSY;
+
+	ret = -EFAULT;
+	if (pin_user_pages_fast((unsigned long)self, 1, 0,
+				&tsk->umcg_worker_page) != 1)
+		goto clear_self;
+
+	if (get_user(server_tid, &self->server_tid))
+		goto unpin_self;
+
+	ret = -ESRCH;
+	server = umcg_get_task(server_tid);
+	if (!server)
+		goto unpin_self;
+
+	ret = -EFAULT;
+	/* must cache due to possible concurrent change vs access_ok() */
+	tsk->umcg_server_task = READ_ONCE(server->umcg_task);
+	if (pin_user_pages_fast((unsigned long)tsk->umcg_server_task, 1, 0,
+				&tsk->umcg_server_page) != 1)
+		goto clear_server;
+
+	tsk->umcg_server = server;
+
+	if (get_user(next_tid, &self->next_tid))
+		goto unpin_server;
+
+	if (!next_tid)
+		goto done;
+
+	ret = -ESRCH;
+	next = umcg_get_task(next_tid);
+	if (!next)
+		goto unpin_server;
+
+	ret = -EFAULT;
+	tsk->umcg_next_task = READ_ONCE(next->umcg_task);
+	if (pin_user_pages_fast((unsigned long)tsk->umcg_next_task, 1, 0,
+				&tsk->umcg_next_page) != 1)
+		goto clear_next;
+
+	tsk->umcg_next = next;
+
+done:
+	return 0;
+
+clear_next:
+	tsk->umcg_next_task = NULL;
+	tsk->umcg_next_page = NULL;
+
+unpin_server:
+	unpin_user_page(tsk->umcg_server_page);
+
+clear_server:
+	tsk->umcg_server_task = NULL;
+	tsk->umcg_server_page = NULL;
+
+unpin_self:
+	unpin_user_page(tsk->umcg_worker_page);
+clear_self:
+	tsk->umcg_worker_page = NULL;
+
+	return ret;
+}
+
+static void umcg_unpin_pages(void)
+{
+	struct task_struct *tsk = current;
+
+	if (tsk->umcg_server) {
+		unpin_user_page(tsk->umcg_worker_page);
+		tsk->umcg_worker_page = NULL;
+
+		unpin_user_page(tsk->umcg_server_page);
+		tsk->umcg_server_page = NULL;
+		tsk->umcg_server_task = NULL;
+
+		put_task_struct(tsk->umcg_server);
+		tsk->umcg_server = NULL;
+
+		if (tsk->umcg_next) {
+			unpin_user_page(tsk->umcg_next_page);
+			tsk->umcg_next_page = NULL;
+			tsk->umcg_next_task = NULL;
+
+			put_task_struct(tsk->umcg_next);
+			tsk->umcg_next = NULL;
+		}
+	}
+}
+
+static void umcg_clear_task(struct task_struct *tsk)
+{
+	/*
+	 * This is either called for the current task, or for a newly forked
+	 * task that is not yet running, so we don't need strict atomicity
+	 * below.
+	 */
+	if (tsk->umcg_task) {
+		WRITE_ONCE(tsk->umcg_task, NULL);
+		tsk->umcg_worker_page = NULL;
+
+		tsk->umcg_server = NULL;
+		tsk->umcg_server_page = NULL;
+		tsk->umcg_server_task = NULL;
+
+		tsk->umcg_next = NULL;
+		tsk->umcg_next_page = NULL;
+		tsk->umcg_next_task = NULL;
+
+		tsk->flags &= ~PF_UMCG_WORKER;
+		clear_task_syscall_work(tsk, SYSCALL_UMCG);
+		clear_tsk_thread_flag(tsk, TIF_UMCG);
+	}
+}
+
+/* Called for a forked or execve-ed child. */
+void umcg_clear_child(struct task_struct *tsk)
+{
+	umcg_clear_task(tsk);
+}
+
+/* Called both by normally (unregister) and abnormally exiting workers. */
+void umcg_worker_exit(void)
+{
+	umcg_unpin_pages();
+	umcg_clear_task(current);
+}
+
+/*
+ * Do a state transition: @from -> @to.
+ *
+ * Will clear UMCG_TF_PREEMPT, UMCG_TF_COND_WAIT.
+ *
+ * When @to == {BLOCKED,RUNNABLE}, update timestamps.
+ *
+ * Returns:
+ *   0: success
+ *   -EAGAIN: when self->state != @from
+ *   -EFAULT
+ */
+static int umcg_update_state(struct task_struct *tsk,
+			     struct umcg_task __user *self,
+			     u32 from, u32 to)
+{
+	u32 old, new;
+	u64 now;
+
+	if (to >= UMCG_TASK_RUNNABLE) {
+		switch (tsk->umcg_clock) {
+		case CLOCK_REALTIME:      now = ktime_get_real_ns();     break;
+		case CLOCK_MONOTONIC:     now = ktime_get_ns();          break;
+		case CLOCK_BOOTTIME:      now = ktime_get_boottime_ns(); break;
+		case CLOCK_TAI:           now = ktime_get_clocktai_ns(); break;
+		}
+	}
+
+	if (!user_access_begin(self, sizeof(*self)))
+		return -EFAULT;
+
+	unsafe_get_user(old, &self->state, Efault);
+	do {
+		if ((old & UMCG_TASK_MASK) != from)
+			goto fail;
+
+		new = old & ~(UMCG_TASK_MASK |
+			      UMCG_TF_PREEMPT | UMCG_TF_COND_WAIT);
+		new |= to & UMCG_TASK_MASK;
+
+	} while (!unsafe_try_cmpxchg_user(&self->state, &old, new, Efault));
+
+	if (to == UMCG_TASK_BLOCKED)
+		unsafe_put_user(now, &self->blocked_ts, Efault);
+	if (to == UMCG_TASK_RUNNABLE)
+		unsafe_put_user(now, &self->runnable_ts, Efault);
+
+	user_access_end();
+	return 0;
+
+fail:
+	user_access_end();
+	return -EAGAIN;
+
+Efault:
+	user_access_end();
+	return -EFAULT;
+}
+
+#define __UMCG_DIE(stmt, reason)	do {				\
+	stmt;								\
+	pr_warn_ratelimited("%s: killing task %s/%d because: " reason "\n",\
+			    __func__, current->comm, current->pid);	\
+	force_sig(SIGKILL);						\
+	return;								\
+} while (0)
+
+#define UMCG_DIE(reason)	__UMCG_DIE(,reason)
+#define UMCG_DIE_PF(reason)	__UMCG_DIE(pagefault_enable(), reason)
+#define UMCG_DIE_UNPIN(reason)	__UMCG_DIE(umcg_unpin_pages(), reason)
+
+/* Called from syscall enter path */
+void umcg_sys_enter(struct pt_regs *regs, long syscall)
+{
+	/* avoid recursion vs our own syscalls */
+	if (syscall == __NR_umcg_wait ||
+	    syscall == __NR_umcg_ctl)
+		return;
+
+	/* avoid recursion vs schedule() */
+	current->flags &= ~PF_UMCG_WORKER;
+
+	/*
+	 * Pin all the state on sys_enter() such that we can rely on it
+	 * from dodgy contexts. It is either unpinned from pre-schedule()
+	 * or sys_exit(), whichever comes first, thereby ensuring the pin
+	 * is temporary.
+	 */
+	if (umcg_pin_pages())
+		UMCG_DIE("pin");
+
+	current->flags |= PF_UMCG_WORKER;
+}
+
+static int umcg_wake_task(struct task_struct *tsk, struct umcg_task __user *self)
+{
+	int ret = umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNING);
+	if (ret)
+		return ret;
+
+	try_to_wake_up(tsk, TASK_NORMAL, WF_CURRENT_CPU);
+	return 0;
+}
+
+static int umcg_wake_next(struct task_struct *tsk)
+{
+	int ret = umcg_wake_task(tsk->umcg_next, tsk->umcg_next_task);
+	if (ret)
+		return ret;
+
+	/*
+	 * If userspace sets umcg_task::next_tid, it needs to remove
+	 * that task from the ready-queue to avoid another server
+	 * selecting it. However, that also means it needs to put it
+	 * back in case it went unused.
+	 *
+	 * By clearing the field on use, userspace can detect this case
+	 * and DTRT.
+	 */
+	if (put_user(0u, &tsk->umcg_task->next_tid))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int umcg_wake_server(struct task_struct *tsk)
+{
+	int ret = umcg_wake_task(tsk->umcg_server, tsk->umcg_server_task);
+	switch (ret) {
+	case 0:
+	case -EAGAIN:
+		/*
+		 * Server could have timed-out or already be running
+		 * due to a runnable enqueue. See umcg_sys_exit().
+		 */
+		break;
+
+	default:
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Wake ::next_tid or ::server_tid.
+ *
+ * Must be called in umcg_pin_pages() context, relies on
+ * tsk->umcg_{server,next}.
+ *
+ * Returns:
+ *   0: success
+ *   -EAGAIN
+ *   -EFAULT
+ */
+static int umcg_wake(struct task_struct *tsk)
+{
+	if (tsk->umcg_next)
+		return umcg_wake_next(tsk);
+
+	return umcg_wake_server(tsk);
+}
+
+/* pre-schedule() */
+void umcg_wq_worker_sleeping(struct task_struct *tsk)
+{
+	struct umcg_task __user *self = READ_ONCE(tsk->umcg_task);
+
+	/* Must not fault, mmap_sem might be held. */
+	pagefault_disable();
+
+	if (WARN_ON_ONCE(!tsk->umcg_server))
+		UMCG_DIE_PF("no server");
+
+	if (umcg_update_state(tsk, self, UMCG_TASK_RUNNING, UMCG_TASK_BLOCKED))
+		UMCG_DIE_PF("state");
+
+	if (umcg_wake(tsk))
+		UMCG_DIE_PF("wake");
+
+	pagefault_enable();
+
+	/*
+	 * We're going to sleep, make sure to unpin the pages, this ensures
+	 * the pins are temporary. Also see umcg_sys_exit().
+	 */
+	umcg_unpin_pages();
+}
+
+/* post-schedule() */
+void umcg_wq_worker_running(struct task_struct *tsk)
+{
+	/* nothing here, see umcg_sys_exit() */
+}
+
+/*
+ * Enqueue @tsk on it's server's runnable list
+ *
+ * Must be called in umcg_pin_pages() context, relies on tsk->umcg_server.
+ *
+ * cmpxchg based single linked list add such that list integrity is never
+ * violated.  Userspace *MUST* remove it from the list before changing ->state.
+ * As such, we must change state to RUNNABLE before enqueue.
+ *
+ * Returns:
+ *   0: success
+ *   -EFAULT
+ */
+static int umcg_enqueue_runnable(struct task_struct *tsk)
+{
+	struct umcg_task __user *server = tsk->umcg_server_task;
+	struct umcg_task __user *self = tsk->umcg_task;
+	u64 self_ptr = (unsigned long)self;
+	u64 first_ptr;
+
+	/*
+	 * umcg_pin_pages() did access_ok() on both pointers, use self here
+	 * only because __user_access_begin() isn't available in generic code.
+	 */
+	if (!user_access_begin(self, sizeof(*self)))
+		return -EFAULT;
+
+	unsafe_get_user(first_ptr, &server->runnable_workers_ptr, Efault);
+	do {
+		unsafe_put_user(first_ptr, &self->runnable_workers_ptr, Efault);
+	} while (!unsafe_try_cmpxchg_user(&server->runnable_workers_ptr, &first_ptr, self_ptr, Efault));
+
+	user_access_end();
+	return 0;
+
+Efault:
+	user_access_end();
+	return -EFAULT;
+}
+
+/*
+ * umcg_wait: Wait for ->state to become RUNNING
+ *
+ * Returns:
+ * 0		- success
+ * -EINTR	- pending signal
+ * -EINVAL	- ::state is not {RUNNABLE,RUNNING}
+ * -ETIMEDOUT
+ * -EFAULT
+ */
+int umcg_wait(u64 timo)
+{
+	struct task_struct *tsk = current;
+	struct umcg_task __user *self = tsk->umcg_task;
+	struct page *page = NULL;
+	u32 state;
+	int ret;
+
+	for (;;) {
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		ret = -EINTR;
+		if (signal_pending(current))
+			break;
+
+		/*
+		 * Faults can block and scribble our wait state.
+		 */
+		pagefault_disable();
+		if (get_user(state, &self->state)) {
+			pagefault_enable();
+
+			ret = -EFAULT;
+			if (page) {
+				unpin_user_page(page);
+				page = NULL;
+				break;
+			}
+
+			if (pin_user_pages_fast((unsigned long)self, 1, 0, &page) != 1) {
+				page = NULL;
+				break;
+			}
+
+			continue;
+		}
+
+		if (page) {
+			unpin_user_page(page);
+			page = NULL;
+		}
+		pagefault_enable();
+
+		state &= UMCG_TASK_MASK;
+		if (state != UMCG_TASK_RUNNABLE) {
+			ret = 0;
+			if (state == UMCG_TASK_RUNNING)
+				break;
+
+			ret = -EINVAL;
+			break;
+		}
+
+		if (!schedule_hrtimeout_range_clock(timo ? &timo : NULL,
+						    tsk->timer_slack_ns,
+						    HRTIMER_MODE_ABS,
+						    tsk->umcg_clock)) {
+			ret = -ETIMEDOUT;
+			break;
+		}
+	}
+	__set_current_state(TASK_RUNNING);
+
+	return ret;
+}
+
+void umcg_sys_exit(struct pt_regs *regs)
+{
+	struct task_struct *tsk = current;
+	struct umcg_task __user *self = READ_ONCE(tsk->umcg_task);
+	long syscall = syscall_get_nr(tsk, regs);
+
+	if (syscall == __NR_umcg_wait)
+		return;
+
+	/*
+	 * sys_umcg_ctl() will get here without having called umcg_sys_enter()
+	 * as such it will look like a syscall that blocked.
+	 */
+
+	if (tsk->umcg_server) {
+		/*
+		 * Didn't block, we done.
+		 */
+		umcg_unpin_pages();
+		return;
+	}
+
+	/* avoid recursion vs schedule() */
+	current->flags &= ~PF_UMCG_WORKER;
+
+	if (umcg_pin_pages())
+		UMCG_DIE("pin");
+
+	if (umcg_update_state(tsk, self, UMCG_TASK_BLOCKED, UMCG_TASK_RUNNABLE))
+		UMCG_DIE_UNPIN("state");
+
+	if (umcg_enqueue_runnable(tsk))
+		UMCG_DIE_UNPIN("enqueue");
+
+	/* Server might not be RUNNABLE, means it's already running */
+	if (umcg_wake_server(tsk))
+		UMCG_DIE_UNPIN("wake-server");
+
+	umcg_unpin_pages();
+
+	switch (umcg_wait(0)) {
+	case -EFAULT:
+	case -EINVAL:
+	case -ETIMEDOUT: /* how!?! */
+	default:
+		UMCG_DIE("wait");
+
+	case 0:
+	case -EINTR:
+		/* notify_resume will continue the wait after the signal */
+		break;
+	}
+
+	current->flags |= PF_UMCG_WORKER;
+}
+
+void umcg_notify_resume(struct pt_regs *regs)
+{
+	struct task_struct *tsk = current;
+	struct umcg_task __user *self = tsk->umcg_task;
+	bool worker = tsk->flags & PF_UMCG_WORKER;
+	u32 state;
+
+	/* avoid recursion vs schedule() */
+	if (worker)
+		current->flags &= ~PF_UMCG_WORKER;
+
+	if (get_user(state, &self->state))
+		UMCG_DIE("get-state");
+
+	state &= UMCG_TASK_MASK | UMCG_TF_MASK;
+	if (state == UMCG_TASK_RUNNING)
+		goto done;
+
+	/*
+	 * See comment at UMCG_TF_COND_WAIT; TL;DR: user *will* call
+	 * sys_umcg_wait() and signals/interrupts shouldn't block
+	 * return-to-user.
+	 */
+	if (state == (UMCG_TASK_RUNNABLE | UMCG_TF_COND_WAIT))
+		goto done;
+
+	if (state & UMCG_TF_PREEMPT) {
+		if (umcg_pin_pages())
+			UMCG_DIE("pin");
+
+		if (umcg_update_state(tsk, self,
+				      UMCG_TASK_RUNNING,
+				      UMCG_TASK_RUNNABLE))
+			UMCG_DIE_UNPIN("state");
+
+		if (umcg_enqueue_runnable(tsk))
+			UMCG_DIE_UNPIN("enqueue");
+
+		/*
+		 * XXX do we want a preemption consuming ::next_tid ?
+		 * I'm currently leaning towards no.
+		 */
+		if (umcg_wake_server(tsk))
+			UMCG_DIE_UNPIN("wake-server");
+
+		umcg_unpin_pages();
+	}
+
+	switch (umcg_wait(0)) {
+	case -EFAULT:
+	case -EINVAL:
+	case -ETIMEDOUT: /* how!?! */
+	default:
+		UMCG_DIE("wait");
+
+	case 0:
+	case -EINTR:
+		/* we'll will resume the wait after the signal */
+		break;
+	}
+
+done:
+	if (worker)
+		current->flags |= PF_UMCG_WORKER;
+}
+
+/**
+ * sys_umcg_kick: makes a UMCG task cycle through umcg_notify_resume()
+ *
+ * Returns:
+ * 0		- Ok;
+ * -ESRCH	- not a related UMCG task
+ * -EINVAL	- another error happened (unknown flags, etc..)
+ */
+SYSCALL_DEFINE2(umcg_kick, u32, flags, pid_t, tid)
+{
+	struct task_struct *task = umcg_get_task(tid);
+	if (!task)
+		return -ESRCH;
+
+	if (flags)
+		return -EINVAL;
+
+#ifdef CONFIG_SMP
+	smp_send_reschedule(task_cpu(task));
+#endif
+
+	return 0;
+}
+
+/**
+ * sys_umcg_wait: transfer running context
+ *
+ * Block until RUNNING. Userspace must already set RUNNABLE to deal with the
+ * sleep condition races (see TF_COND_WAIT).
+ *
+ * Will wake either ::next_tid or ::server_tid to take our place. If this is a
+ * server then not setting ::next_tid will wake self.
+ *
+ * Returns:
+ * 0		- OK;
+ * -ETIMEDOUT	- the timeout expired;
+ * -ERANGE	- the timeout is out of range (worker);
+ * -EAGAIN	- ::state wasn't RUNNABLE, concurrent wakeup;
+ * -EFAULT	- failed accessing struct umcg_task __user of the current
+ *		  task, the server or next;
+ * -ESRCH	- the task to wake not found or not a UMCG task;
+ * -EINVAL	- another error happened (e.g. the current task is not a
+ *		  UMCG task, etc.)
+ */
+SYSCALL_DEFINE2(umcg_wait, u32, flags, u64, timo)
+{
+	struct task_struct *tsk = current;
+	struct umcg_task __user *self = READ_ONCE(tsk->umcg_task);
+	bool worker = tsk->flags & PF_UMCG_WORKER;
+	int ret;
+
+	if (!self || flags)
+		return -EINVAL;
+
+	if (worker) {
+		tsk->flags &= ~PF_UMCG_WORKER;
+		if (timo)
+			return -ERANGE;
+	}
+
+	/* see umcg_sys_{enter,exit}() syscall exceptions */
+	ret = umcg_pin_pages();
+	if (ret)
+		goto unblock;
+
+	/*
+	 * Clear UMCG_TF_COND_WAIT *and* check state == RUNNABLE.
+	 */
+	ret = umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNABLE);
+	if (ret)
+		goto unpin;
+
+	if (worker) {
+		ret = umcg_enqueue_runnable(tsk);
+		if (ret)
+			goto unpin;
+	}
+
+	if (worker)
+		ret = umcg_wake(tsk);
+	else if (tsk->umcg_next)
+		ret = umcg_wake_next(tsk);
+
+	if (ret) {
+		/*
+		 * XXX already enqueued ourself on ::server_tid; failing now
+		 * leaves the lot in an inconsistent state since it'll also
+		 * unblock self in order to return the error. !?!?
+		 */
+		goto unpin;
+	}
+
+	umcg_unpin_pages();
+
+	ret = umcg_wait(timo);
+	switch (ret) {
+	case 0:		/* all done */
+	case -EINTR:	/* umcg_notify_resume() will continue the wait */
+		ret = 0;
+		break;
+
+	default:
+		goto unblock;
+	}
+out:
+	if (worker)
+		tsk->flags |= PF_UMCG_WORKER;
+	return ret;
+
+unpin:
+	umcg_unpin_pages();
+unblock:
+	/*
+	 * Workers will still block in umcg_notify_resume() before they can
+	 * consume their error, servers however need to get the error asap.
+	 *
+	 * Still, things might be unrecoverably screwy after this. Not our
+	 * problem.
+	 */
+	if (!worker)
+		umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNING);
+	goto out;
+}
+
+/**
+ * sys_umcg_ctl: (un)register the current task as a UMCG task.
+ * @flags:       ORed values from enum umcg_ctl_flag; see below;
+ * @self:        a pointer to struct umcg_task that describes this
+ *               task and governs the behavior of sys_umcg_wait if
+ *               registering; must be NULL if unregistering.
+ *
+ * @flags & UMCG_CTL_REGISTER: register a UMCG task:
+ *
+ *         UMCG workers:
+ *              - @flags & UMCG_CTL_WORKER
+ *              - self->state must be UMCG_TASK_BLOCKED
+ *
+ *         UMCG servers:
+ *              - !(@flags & UMCG_CTL_WORKER)
+ *              - self->state must be UMCG_TASK_RUNNING
+ *
+ *         All tasks:
+ *              - self->server_tid must be a valid server
+ *              - self->next_tid must be zero
+ *
+ *         If the conditions above are met, sys_umcg_ctl() immediately returns
+ *         if the registered task is a server. If the registered task is a
+ *         worker it will be added to it's server's runnable_workers_ptr list
+ *         and the server will be woken.
+ *
+ * @flags == UMCG_CTL_UNREGISTER: unregister a UMCG task. If the current task
+ *           is a UMCG worker, the userspace is responsible for waking its
+ *           server (before or after calling sys_umcg_ctl).
+ *
+ * Return:
+ * 0		- success
+ * -EFAULT	- failed to read @self
+ * -EINVAL	- some other error occurred
+ * -ESRCH	- no such server_tid
+ */
+SYSCALL_DEFINE3(umcg_ctl, u32, flags, struct umcg_task __user *, self, clockid_t, which_clock)
+{
+	struct task_struct *server;
+	struct umcg_task ut;
+
+	if ((unsigned long)self % UMCG_TASK_ALIGN)
+		return -EINVAL;
+
+	if (flags & ~(UMCG_CTL_REGISTER |
+		      UMCG_CTL_UNREGISTER |
+		      UMCG_CTL_WORKER))
+		return -EINVAL;
+
+	if (flags == UMCG_CTL_UNREGISTER) {
+		if (self || !current->umcg_task)
+			return -EINVAL;
+
+		if (current->flags & PF_UMCG_WORKER)
+			umcg_worker_exit();
+		else
+			umcg_clear_task(current);
+
+		return 0;
+	}
+
+	if (!(flags & UMCG_CTL_REGISTER))
+		return -EINVAL;
+
+	flags &= ~UMCG_CTL_REGISTER;
+
+	switch (which_clock) {
+	case CLOCK_REALTIME:
+	case CLOCK_MONOTONIC:
+	case CLOCK_BOOTTIME:
+	case CLOCK_TAI:
+		current->umcg_clock = which_clock;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	if (current->umcg_task || !self)
+		return -EINVAL;
+
+	if (copy_from_user(&ut, self, sizeof(ut)))
+		return -EFAULT;
+
+	if (ut.next_tid || ut.__hole[0] || ut.__zero[0] || ut.__zero[1] || ut.__zero[2])
+		return -EINVAL;
+
+	rcu_read_lock();
+	server = find_task_by_vpid(ut.server_tid);
+	if (server && server->mm == current->mm) {
+		if (flags == UMCG_CTL_WORKER) {
+			if (!server->umcg_task ||
+			    (server->flags & PF_UMCG_WORKER))
+				server = NULL;
+		} else {
+			if (server != current)
+				server = NULL;
+		}
+	} else {
+		server = NULL;
+	}
+	rcu_read_unlock();
+
+	if (!server)
+		return -ESRCH;
+
+	if (flags == UMCG_CTL_WORKER) {
+		if ((ut.state & (UMCG_TASK_MASK | UMCG_TF_MASK)) != UMCG_TASK_BLOCKED)
+			return -EINVAL;
+
+		WRITE_ONCE(current->umcg_task, self);
+		current->flags |= PF_UMCG_WORKER;	/* hook schedule() */
+		set_syscall_work(SYSCALL_UMCG);		/* hook syscall */
+		set_thread_flag(TIF_UMCG);		/* hook return-to-user */
+
+		/* umcg_sys_exit() will transition to RUNNABLE and wait */
+
+	} else {
+		if ((ut.state & (UMCG_TASK_MASK | UMCG_TF_MASK)) != UMCG_TASK_RUNNING)
+			return -EINVAL;
+
+		WRITE_ONCE(current->umcg_task, self);
+		set_thread_flag(TIF_UMCG);		/* hook return-to-user */
+
+		/* umcg_notify_resume() would block if not RUNNING */
+	}
+
+	return 0;
+}
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -273,6 +273,11 @@  COND_SYSCALL(landlock_create_ruleset);
 COND_SYSCALL(landlock_add_rule);
 COND_SYSCALL(landlock_restrict_self);
 
+/* kernel/sched/umcg.c */
+COND_SYSCALL(umcg_ctl);
+COND_SYSCALL(umcg_wait);
+COND_SYSCALL(umcg_kick);
+
 /* arch/example/kernel/sys_example.c */
 
 /* mm/fadvise.c */