
[RFC,11/13] x86/uintr: Introduce uintr_wait() syscall

Message ID 20210913200132.3396598-12-sohil.mehta@intel.com (mailing list archive)
State New
Series x86 User Interrupts support

Commit Message

Sohil Mehta Sept. 13, 2021, 8:01 p.m. UTC
Add a new system call to allow applications to block in the kernel and
wait for user interrupts.

<The current implementation doesn't support waking up from other
blocking system calls like sleep(), read(), epoll(), etc.

uintr_wait() is a placeholder syscall while we decide on that
behaviour.>

When the application makes this syscall the notification vector is
switched to a new kernel vector. Any new SENDUIPI will invoke the kernel
interrupt which is then used to wake up the process.
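A rough sketch of the intended usage from user space is shown below
(illustrative only; __NR_uintr_wait stands in for the syscall number wired
up by this series, and handler registration via the earlier patches is
assumed):

	#include <errno.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	/* Block in the kernel until a SENDUIPI (or a signal) wakes the
	 * task up. This RFC currently always returns -EINTR on wakeup.
	 */
	static void wait_for_user_interrupt(void)
	{
		long ret = syscall(__NR_uintr_wait);

		if (ret == -1 && errno != EINTR)
			perror("uintr_wait");
	}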

Currently, the task wait list is a global one. To make the implementation
scalable, it needs to move to distributed per-cpu wait lists.

Signed-off-by: Sohil Mehta <sohil.mehta@intel.com>
---
 arch/x86/include/asm/hardirq.h     |  1 +
 arch/x86/include/asm/idtentry.h    |  1 +
 arch/x86/include/asm/irq_vectors.h |  3 +-
 arch/x86/include/asm/uintr.h       | 22 +++++++
 arch/x86/kernel/idt.c              |  1 +
 arch/x86/kernel/irq.c              | 18 ++++++
 arch/x86/kernel/uintr_core.c       | 94 ++++++++++++++++++++++++------
 arch/x86/kernel/uintr_fd.c         | 15 +++++
 8 files changed, 136 insertions(+), 19 deletions(-)

Comments

Thomas Gleixner Sept. 24, 2021, 11:04 a.m. UTC | #1
On Mon, Sep 13 2021 at 13:01, Sohil Mehta wrote:
> Add a new system call to allow applications to block in the kernel and
> wait for user interrupts.
>
> <The current implementation doesn't support waking up from other
> blocking system calls like sleep(), read(), epoll(), etc.
>
> uintr_wait() is a placeholder syscall while we decide on that
> behaviour.>
>
> When the application makes this syscall the notification vector is
> switched to a new kernel vector. Any new SENDUIPI will invoke the kernel
> interrupt which is then used to wake up the process.
>
> Currently, the task wait list is a global one. To make the implementation
> scalable, it needs to move to distributed per-cpu wait lists.

How are per cpu wait lists going to solve the problem?

> +
> +/*
> + * Handler for UINTR_KERNEL_VECTOR.
> + */
> +DEFINE_IDTENTRY_SYSVEC(sysvec_uintr_kernel_notification)
> +{
> +	/* TODO: Add entry-exit tracepoints */
> +	ack_APIC_irq();
> +	inc_irq_stat(uintr_kernel_notifications);
> +
> +	uintr_wake_up_process();

So this interrupt happens for any of those notifications. How are they
differentiated? 
>  
> +int uintr_receiver_wait(void)
> +{
> +	struct uintr_upid_ctx *upid_ctx;
> +	unsigned long flags;
> +
> +	if (!is_uintr_receiver(current))
> +		return -EOPNOTSUPP;
> +
> +	upid_ctx = current->thread.ui_recv->upid_ctx;
> +	upid_ctx->upid->nc.nv = UINTR_KERNEL_VECTOR;
> +	upid_ctx->waiting = true;
> +	spin_lock_irqsave(&uintr_wait_lock, flags);
> +	list_add(&upid_ctx->node, &uintr_wait_list);
> +	spin_unlock_irqrestore(&uintr_wait_lock, flags);
> +
> +	set_current_state(TASK_INTERRUPTIBLE);

Because we have not enough properly implemented wait primitives you need
to open code one which is blatantly wrong vs. a concurrent wake up?

> +	schedule();

How is that correct vs. a spurious wakeup? What takes care that the
entry is removed from the list?

Again. We have proper wait primitives.

> +	return -EINTR;
> +}
> +
> +/*
> + * Runs in interrupt context.
> + * Scan through all UPIDs to check if any interrupt is ongoing.
> + */
> +void uintr_wake_up_process(void)
> +{
> +	struct uintr_upid_ctx *upid_ctx, *tmp;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&uintr_wait_lock, flags);
> +	list_for_each_entry_safe(upid_ctx, tmp, &uintr_wait_list, node) {
> +		if (test_bit(UPID_ON, (unsigned long*)&upid_ctx->upid->nc.status)) {
> +			set_bit(UPID_SN, (unsigned long *)&upid_ctx->upid->nc.status);
> +			upid_ctx->upid->nc.nv = UINTR_NOTIFICATION_VECTOR;
> +			upid_ctx->waiting = false;
> +			wake_up_process(upid_ctx->task);
> +			list_del(&upid_ctx->node);

So any of these notification interrupts does a global mass wake up? How
does that make sense?

> +		}
> +	}
> +	spin_unlock_irqrestore(&uintr_wait_lock, flags);
> +}
> +
> +/* Called when task is unregistering/exiting */
> +static void uintr_remove_task_wait(struct task_struct *task)
> +{
> +	struct uintr_upid_ctx *upid_ctx, *tmp;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&uintr_wait_lock, flags);
> +	list_for_each_entry_safe(upid_ctx, tmp, &uintr_wait_list, node) {
> +		if (upid_ctx->task == task) {
> +			pr_debug("wait: Removing task %d from wait\n",
> +				 upid_ctx->task->pid);
> +			upid_ctx->upid->nc.nv = UINTR_NOTIFICATION_VECTOR;
> +			upid_ctx->waiting = false;
> +			list_del(&upid_ctx->node);
> +		}

What? You have to do a global list walk to find the entry which you
added yourself?

Thanks,

        tglx
Thomas Gleixner Sept. 25, 2021, 12:08 p.m. UTC | #2
On Fri, Sep 24 2021 at 13:04, Thomas Gleixner wrote:
> On Mon, Sep 13 2021 at 13:01, Sohil Mehta wrote:
>> +int uintr_receiver_wait(void)
>> +{
>> +	struct uintr_upid_ctx *upid_ctx;
>> +	unsigned long flags;
>> +
>> +	if (!is_uintr_receiver(current))
>> +		return -EOPNOTSUPP;
>> +
>> +	upid_ctx = current->thread.ui_recv->upid_ctx;
>> +	upid_ctx->upid->nc.nv = UINTR_KERNEL_VECTOR;
>> +	upid_ctx->waiting = true;
>> +	spin_lock_irqsave(&uintr_wait_lock, flags);
>> +	list_add(&upid_ctx->node, &uintr_wait_list);
>> +	spin_unlock_irqrestore(&uintr_wait_lock, flags);
>> +
>> +	set_current_state(TASK_INTERRUPTIBLE);
>
> Because we have not enough properly implemented wait primitives you need
> to open code one which is blatantly wrong vs. a concurrent wake up?
>
>> +	schedule();
>
> How is that correct vs. a spurious wakeup? What takes care that the
> entry is removed from the list?
>
> Again. We have proper wait primitives.

Aside of that this is completely broken vs. CPU hotplug.

CPU X
  switchto(tsk)
    tsk->upid.ndst = apicid(smp_processor_id());

  ret_to_user()
  ...
  sys_uintr_wait()
    ...
    schedule()

After that CPU X is unplugged which means the task won't be woken up by
a user IPI which is issued after CPU X went down.

Thanks,

        tglx
Thomas Gleixner Sept. 26, 2021, 2:41 p.m. UTC | #3
On Mon, Sep 13 2021 at 13:01, Sohil Mehta wrote:
> Add a new system call to allow applications to block in the kernel and
> wait for user interrupts.
>
> <The current implementation doesn't support waking up from other
> blocking system calls like sleep(), read(), epoll(), etc.
>
> uintr_wait() is a placeholder syscall while we decide on that
> behaviour.>

Which behaviour? You cannot integrate this into [clock_]nanosleep() by
any means or wakeup something which is blocked in read(somefd) via
SENDUIPI.

What you can do is implement read() and poll() support for the
uintrfd. Anything else is just not going to fly.

Adding support for read/poll is pretty much a straightforward variant
of a correctly implemented wait()/wakeup() mechanism.

While poll()/read() support might be useful and poll() also provides a
timeout, having an explicit (timed) wait mechanism might be interesting.

But that brings me to an interesting question. There are two cases:

 1) The task installed a user space interrupt handler. Now it
    wants to play nice and yield the CPU while waiting.

    So it needs to reinstall the UINV vector on return to user and
    update UIRR, but that'd be covered by the existing mechanism. Fine.

 2) Task has no user space interrupt handler installed and just wants
    to use that wait mechanism.

    What is consuming the pending bit(s)? 

    If that's not a valid use case, then the wait has to check for that
    and reject the syscall with EINVAL.

    If it is valid, then how are the pending bits consumed and relayed to
    user space?

The same questions arise when you think about implementing poll/read
support simply because the regular poll/read semantics are:

  poll waits for the event and read consumes the event

which would be similar to #2 above, but with an installed user space
interrupt handler the return from the poll system call would consume the
event immediately (assuming that UIF is set).

Thanks,

        tglx
Sohil Mehta Sept. 28, 2021, 11:08 p.m. UTC | #4
On 9/24/2021 4:04 AM, Thomas Gleixner wrote:
> On Mon, Sep 13 2021 at 13:01, Sohil Mehta wrote:
>> Currently, the task wait list is a global one. To make the implementation
>> scalable, it needs to move to distributed per-cpu wait lists.
> How are per cpu wait lists going to solve the problem?


Currently, the global wait list can be concurrently accessed by multiple 
cpus. If we have per-cpu wait lists then the UPID scanning only needs to 
happen on the local cpu's wait list.

After an application calls uintr_wait(), the notification interrupt will 
be delivered only to the cpu where the task blocked. In this case, we 
can reduce the UPID search list and probably get rid of the global 
spinlock as well.

Though, I am not sure how much impact this would have vs. the problem of 
scanning the entire wait list.
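Roughly, something like this is what I have in mind (an untested sketch;
the struct, helper and per-cpu variable names below are placeholders):

	#include <linux/list.h>
	#include <linux/percpu.h>
	#include <linux/spinlock.h>

	/* Placeholder per-cpu wait list that would replace the global
	 * uintr_wait_list + uintr_wait_lock pair. Each CPU's lock and
	 * list head would be initialized at boot (init not shown).
	 */
	struct uintr_wait_cpu {
		spinlock_t lock;
		struct list_head head;
	};

	static DEFINE_PER_CPU(struct uintr_wait_cpu, uintr_wait_cpu);

	/* Blocking path: queue the UPID on the CPU the task blocks on,
	 * which is also where UPID.NDST points, so the kernel notification
	 * interrupt arrives on the same CPU and only the local list needs
	 * to be scanned.
	 */
	static void uintr_add_task_wait(struct uintr_upid_ctx *upid_ctx)
	{
		struct uintr_wait_cpu *uwc = get_cpu_ptr(&uintr_wait_cpu);
		unsigned long flags;

		spin_lock_irqsave(&uwc->lock, flags);
		list_add(&upid_ctx->node, &uwc->head);
		spin_unlock_irqrestore(&uwc->lock, flags);
		put_cpu_ptr(&uintr_wait_cpu);
	}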

>> +
>> +/*
>> + * Handler for UINTR_KERNEL_VECTOR.
>> + */
>> +DEFINE_IDTENTRY_SYSVEC(sysvec_uintr_kernel_notification)
>> +{
>> +	/* TODO: Add entry-exit tracepoints */
>> +	ack_APIC_irq();
>> +	inc_irq_stat(uintr_kernel_notifications);
>> +
>> +	uintr_wake_up_process();
> So this interrupt happens for any of those notifications. How are they
> differentiated?


Unfortunately, there is no help from the hardware here to identify the 
intended target.

When a task blocks we:
* switch the UINV to a kernel NV.
* leave SN as 0
* leave UPID.NDST to the current cpu
* add the task to a wait list

When the notification interrupt arrives:
* Scan the entire wait list to check if the ON bit is set for any UPID 
(very inefficient)
* Set SN to 1 for that task.
* Change the UINV to user NV.
* Remove the task from the list and make it runnable.

We could end up detecting multiple tasks that have the ON bit set. The 
notification interrupt for any task that has ON set is expected to 
arrive soon anyway. So no harm done here.

The main issue here is we would end up scanning the entire list for 
every interrupt. Not sure if there is any way we could optimize this?


> Again. We have proper wait primitives.

I'll use proper wait primitives next time.
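Something along these lines is what I have in mind (a minimal sketch on top
of this patch; the global wait queue and the ->waiting condition below are
placeholders, not the final design):

	#include <linux/sched.h>
	#include <linux/wait.h>

	static DECLARE_WAIT_QUEUE_HEAD(uintr_wait_queue);

	int uintr_receiver_wait(void)
	{
		struct uintr_upid_ctx *upid_ctx;

		if (!is_uintr_receiver(current))
			return -EOPNOTSUPP;

		upid_ctx = current->thread.ui_recv->upid_ctx;
		upid_ctx->upid->nc.nv = UINTR_KERNEL_VECTOR;
		WRITE_ONCE(upid_ctx->waiting, true);

		/* Copes with concurrent and spurious wakeups; returns
		 * -ERESTARTSYS when interrupted by a signal.
		 */
		return wait_event_interruptible(uintr_wait_queue,
						!READ_ONCE(upid_ctx->waiting));
	}

	/* The kernel notification handler would then clear ->waiting for the
	 * matching UPID and call wake_up_interruptible(&uintr_wait_queue).
	 */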
>> +	return -EINTR;
>> +}
>> +
>> +/*
>> + * Runs in interrupt context.
>> + * Scan through all UPIDs to check if any interrupt is ongoing.
>> + */
>> +void uintr_wake_up_process(void)
>> +{
>> +	struct uintr_upid_ctx *upid_ctx, *tmp;
>> +	unsigned long flags;
>> +
>> +	spin_lock_irqsave(&uintr_wait_lock, flags);
>> +	list_for_each_entry_safe(upid_ctx, tmp, &uintr_wait_list, node) {
>> +		if (test_bit(UPID_ON, (unsigned long*)&upid_ctx->upid->nc.status)) {
>> +			set_bit(UPID_SN, (unsigned long *)&upid_ctx->upid->nc.status);
>> +			upid_ctx->upid->nc.nv = UINTR_NOTIFICATION_VECTOR;
>> +			upid_ctx->waiting = false;
>> +			wake_up_process(upid_ctx->task);
>> +			list_del(&upid_ctx->node);
> So any of these notification interrupts does a global mass wake up? How
> does that make sense?


The wake up happens only for the tasks that have a pending interrupt. 
They are going to be woken up soon anyways.

>> +/* Called when task is unregistering/exiting */
>> +static void uintr_remove_task_wait(struct task_struct *task)
>> +{
>> +	struct uintr_upid_ctx *upid_ctx, *tmp;
>> +	unsigned long flags;
>> +
>> +	spin_lock_irqsave(&uintr_wait_lock, flags);
>> +	list_for_each_entry_safe(upid_ctx, tmp, &uintr_wait_list, node) {
>> +		if (upid_ctx->task == task) {
>> +			pr_debug("wait: Removing task %d from wait\n",
>> +				 upid_ctx->task->pid);
>> +			upid_ctx->upid->nc.nv = UINTR_NOTIFICATION_VECTOR;
>> +			upid_ctx->waiting = false;
>> +			list_del(&upid_ctx->node);
>> +		}
> What? You have to do a global list walk to find the entry which you
> added yourself?

Duh! I could have gotten the upid_ctx from the task_struct itself. Will 
fix this.

Thanks,

Sohil
Sohil Mehta Sept. 28, 2021, 11:13 p.m. UTC | #5
On 9/25/2021 5:08 AM, Thomas Gleixner wrote:
> Aside of that this is completely broken vs. CPU hotplug.
>
Thank you for pointing this out. I hadn't even considered CPU hotplug.

Thanks,
Sohil
Sohil Mehta Sept. 29, 2021, 1:09 a.m. UTC | #6
On 9/26/2021 7:41 AM, Thomas Gleixner wrote:
> On Mon, Sep 13 2021 at 13:01, Sohil Mehta wrote:
>> Add a new system call to allow applications to block in the kernel and
>> wait for user interrupts.
>>
>> <The current implementation doesn't support waking up from other
>> blocking system calls like sleep(), read(), epoll(), etc.
>>
>> uintr_wait() is a placeholder syscall while we decide on that
>> behaviour.>
> Which behaviour? You cannot integrate this into [clock_]nanosleep() by
> any means or wakeup something which is blocked in read(somefd) via
> SENDUIPI.

That is the (wishful) desire.

The idea is to have a behavior similar to signals for all or a subset of 
system calls. i.e. return an EINTR by interrupting the blocked syscall 
and possibly have a SA_RESTART type of mechanism.

Can we use the existing signal infrastructure to generate a temporary 
in-kernel signal upon detection of a pending user interrupt? The
temporary signal doesn't need to be delivered to application but it 
would just be a mechanism to interrupt the blocked syscall.

I don't know anything about the signaling subsystem nor have I tried 
prototyping this. So, all this might be completely baseless.


> What you can do is implement read() and poll() support for the
> uintrfd. Anything else is just not going to fly.
>
> Adding support for read/poll is pretty much a straightforward variant
> of a correctly implemented wait()/wakeup() mechanism.

I tried doing this but I ran into a couple of issues.

1) uintrfd is mapped to a single vector (out of 64). But there is no 
easy hardware mechanism to wait for specific vectors. Waiting for one 
vector might mean waiting for all.

2) The scope of uintrfd is process wide. Also, it would be shared with 
senders. But the wait/wake mechanism is specific to the task that 
created the fd and has a UPID allocated.
As you mentioned below, relaying the pending interrupt information of 
another task would be very tricky.


> While poll()/read() support might be useful and poll() also provides a
> timeout, having an explicit (timed) wait mechanism might be interesting.

I prototyped uintr_wait() with the same intention to have an explicit 
timed yield mechanism. There is very little ambiguity about who is 
waiting for what and how we would deliver the interrupts.


> But that brings me to an interesting question. There are two cases:
>
>   1) The task installed a user space interrupt handler. Now it
>      wants to play nice and yield the CPU while waiting.
>
>      So it needs to reinstall the UINV vector on return to user and
>      update UIRR, but that'd be covered by the existing mechanism. Fine.
>
>   2) Task has no user space interrupt handler installed and just wants
>      to use that wait mechanism.
>
>      What is consuming the pending bit(s)?
>
>      If that's not a valid use case, then the wait has to check for that
>      and reject the syscall with EINVAL.

Yeah. I feel this is not a valid use case. But I am no application 
developer. I will try to seek more opinions here.


>      If it is valid, then how are the pending bits consumed and relayed to
>      user space?

This is very tricky. Because a task that owns the UPID might be 
consuming interrupts while the kernel tries to relay the pending 
interrupt information to another task.


> The same questions arise when you think about implementing poll/read
> support simply because the regular poll/read semantics are:
>
>    poll waits for the event and read consumes the event
> which would be similar to #2 above, but with an installed user space
> interrupt handler the return from the poll system call would consume the
> event immediately (assuming that UIF is set).
>

Yup. There is no read data associated with uintrfd. This might be 
confusing for the application.

Overall, I feel signal handler semantics fit better with User Interrupt 
handlers. But as you mentioned there might be no easy way to achieve that.

Thanks again for providing your input on this.

Sohil
Andy Lutomirski Sept. 29, 2021, 3:30 a.m. UTC | #7
On Mon, Sep 13, 2021, at 1:01 PM, Sohil Mehta wrote:
> Add a new system call to allow applications to block in the kernel and
> wait for user interrupts.
>

...

>
> When the application makes this syscall the notification vector is
> switched to a new kernel vector. Any new SENDUIPI will invoke the kernel
> interrupt which is then used to wake up the process.

Any new SENDUIPI that happens to hit the target CPU's ucode at a time when the kernel vector is enabled will deliver the interrupt.  Any new SENDUIPI that happens to hit the target CPU's ucode at a time when a different UIPI-using task is running will *not* deliver the interrupt, unless I'm missing some magic.  Which means that wakeups will be missed, which I think makes this whole idea a nonstarter.

Am I missing something?
Sohil Mehta Sept. 29, 2021, 4:56 a.m. UTC | #8
On 9/28/2021 8:30 PM, Andy Lutomirski wrote:
> On Mon, Sep 13, 2021, at 1:01 PM, Sohil Mehta wrote:
>> Add a new system call to allow applications to block in the kernel and
>> wait for user interrupts.
>>
> ...
>
>> When the application makes this syscall the notification vector is
>> switched to a new kernel vector. Any new SENDUIPI will invoke the kernel
>> interrupt which is then used to wake up the process.
> Any new SENDUIPI that happens to hit the target CPU's ucode at a time when the kernel vector is enabled will deliver the interrupt.  Any new SENDUIPI that happens to hit the target CPU's ucode at a time when a different UIPI-using task is running will *not* deliver the interrupt, unless I'm missing some magic.  Which means that wakeups will be missed, which I think makes this whole idea a nonstarter.
>
> Am I missing something?


The current kernel implementation reserves 2 notification vectors (NV) 
for the 2 states of a thread (running vs blocked).

NV-1 – used only for tasks that are running. (results in a user 
interrupt or a spurious kernel interrupt)

NV-2 – used only for tasks that are blocked in the kernel. (always 
results in a kernel interrupt)

The UPID.UINV bits are switched between NV-1 and NV-2 based on the state 
of the task.

However, NV-1 is also programmed in the running task's MISC_MSR UINV 
bits. This is what tells the ucode that the notification vector received 
is for the user instead of the kernel.

NV-2 is never programmed in the MISC_MSR of a task. When NV-2 arrives on 
any cpu there is never a possibility of it being detected as a User 
Interrupt. It will always be delivered to the kernel.

Does this help clarify the above?


I just realized, we need to be careful when the notification vectors are 
switched in the UPID. Any pending vectors detected after the switch 
should abort the blocking call. The current code is wrong in a lot of 
places where it touches the UPID.

Thanks,
Sohil
Andy Lutomirski Sept. 30, 2021, 6:08 p.m. UTC | #9
On Tue, Sep 28, 2021, at 9:56 PM, Sohil Mehta wrote:
> On 9/28/2021 8:30 PM, Andy Lutomirski wrote:
>> On Mon, Sep 13, 2021, at 1:01 PM, Sohil Mehta wrote:
>>> Add a new system call to allow applications to block in the kernel and
>>> wait for user interrupts.
>>>
>> ...
>>
>>> When the application makes this syscall the notification vector is
>>> switched to a new kernel vector. Any new SENDUIPI will invoke the kernel
>>> interrupt which is then used to wake up the process.
>> Any new SENDUIPI that happens to hit the target CPU's ucode at a time when the kernel vector is enabled will deliver the interrupt.  Any new SENDUIPI that happens to hit the target CPU's ucode at a time when a different UIPI-using task is running will *not* deliver the interrupt, unless I'm missing some magic.  Which means that wakeups will be missed, which I think makes this whole idea a nonstarter.
>>
>> Am I missing something?
>
>
> The current kernel implementation reserves 2 notification vectors (NV) 
> for the 2 states of a thread (running vs blocked).
>
> NV-1 – used only for tasks that are running. (results in a user 
> interrupt or a spurious kernel interrupt)
>
> NV-2 – used only for a tasks that are blocked in the kernel. (always 
> results in a kernel interrupt)
>
> The UPID.UINV bits are switched between NV-1 and NV-2 based on the state 
> of the task.

Aha, cute.  So NV-1 is only sent if the target is directly paying attention and, assuming all the atomics are done right, NV-2 will be sent for tasks that are asleep.

Logically, I think these are the possible states for a receiving task:

1. Running.  SENDUIPI will actually deliver the event directly (or not if uintr is masked).  If the task just stopped running and the atomics are right, then the schedule-out code can, I think, notice.

2. Not running, but either runnable or not currently waiting for uintr (e.g. blocked in an unrelated syscall).  This is straightforward -- no IPI or other action is needed other than setting the uintr-pending bit.

3. Blocked and waiting for uintr.  For this to work right, anyone trying to send with SENDUIPI (or maybe a vdso or similar clever wrapper around it) needs to result in either a fault or an IPI so the kernel can process the wakeup.

(Note that, depending on how fancy we get with file descriptors and polling, we need to watch out for the running-and-also-waiting-for-kernel-notification state.  That one will never work right.)

3 is the nasty case, and your patch makes it work with this NV-2 trick.  The trick is a bit gross for a couple reasons.  First, it conveys no useful information to the kernel except that an unknown task did SENDUIPI and maybe that the target was most recently on a given CPU.  So a big list search is needed.  Also, it hits an essentially arbitrary and possibly completely innocent victim CPU and task, and people doing any sort of task isolation workload will strongly dislike this.  For some of those users, "strongly" may mean "treat system as completely failed, fail over to something else and call expensive tech support."  So we can't do that.

I think we have three choices:

Use a fancy wrapper around SENDUIPI.  This is probably a bad idea.

Treat the NV-2 as a real interrupt and honor affinity settings.  This will be annoying and slow, I think, if it's even workable at all.

Handle this case with faults instead of interrupts.  We could set a reserved bit in UPID so that SENDUIPI results in #GP, decode it, and process it.  This puts the onus on the actual task causing trouble, which is nice, and it lets us find the UPID and target directly instead of walking all of them.  I don't know how well it would play with hypothetical future hardware-initiated uintrs, though.
Thomas Gleixner Sept. 30, 2021, 7:29 p.m. UTC | #10
On Thu, Sep 30 2021 at 11:08, Andy Lutomirski wrote:
> On Tue, Sep 28, 2021, at 9:56 PM, Sohil Mehta wrote:
> I think we have three choices:
>
> Use a fancy wrapper around SENDUIPI.  This is probably a bad idea.
>
> Treat the NV-2 as a real interrupt and honor affinity settings.  This
> will be annoying and slow, I think, if it's even workable at all.

We can make it a real interrupt in the form of a per CPU interrupt, but
affinity settings are not really feasible because the affinity is in the
UPID.ndst field. So, yes we can target it to some CPU, but that's racy.

> Handle this case with faults instead of interrupts.  We could set a
> reserved bit in UPID so that SENDUIPI results in #GP, decode it, and
> process it.  This puts the onus on the actual task causing trouble,
> which is nice, and it lets us find the UPID and target directly
> instead of walking all of them.  I don't know how well it would play
> with hypothetical future hardware-initiated uintrs, though.

I thought about that as well and dismissed it due to the hardware
initiated ones but thinking more about it, those need some translation
unit (e.g. irq remapping) anyway, so it might be doable to catch those
as well. So we could just ignore them for now and go for the #GP trick
and deal with the device initiated ones later when they come around :)

But even with that we still need to keep track of the armed ones per CPU
so we can handle CPU hotunplug correctly. Sigh...

Thanks,

        tglx
Andy Lutomirski Sept. 30, 2021, 10:01 p.m. UTC | #11
On Thu, Sep 30, 2021, at 12:29 PM, Thomas Gleixner wrote:
> On Thu, Sep 30 2021 at 11:08, Andy Lutomirski wrote:
>> On Tue, Sep 28, 2021, at 9:56 PM, Sohil Mehta wrote:
>> I think we have three choices:
>>
>> Use a fancy wrapper around SENDUIPI.  This is probably a bad idea.
>>
>> Treat the NV-2 as a real interrupt and honor affinity settings.  This
>> will be annoying and slow, I think, if it's even workable at all.
>
> We can make it a real interrupt in the form of a per CPU interrupt, but
> affinity settings are not really feasible because the affinity is in the
> UPID.ndst field. So, yes we can target it to some CPU, but that's racy.
>
>> Handle this case with faults instead of interrupts.  We could set a
>> reserved bit in UPID so that SENDUIPI results in #GP, decode it, and
>> process it.  This puts the onus on the actual task causing trouble,
>> which is nice, and it lets us find the UPID and target directly
>> instead of walking all of them.  I don't know how well it would play
>> with hypothetical future hardware-initiated uintrs, though.
>
> I thought about that as well and dismissed it due to the hardware
> initiated ones but thinking more about it, those need some translation
> unit (e.g. irq remapping) anyway, so it might be doable to catch those
> as well. So we could just ignore them for now and go for the #GP trick
> and deal with the device initiated ones later when they come around :)

Sounds good to me. In the long run, if Intel wants device initiated fancy interrupts to work well, they need a new design.

>
> But even with that we still need to keep track of the armed ones per CPU
> so we can handle CPU hotunplug correctly. Sigh...

I don’t think any real work is needed. We will only ever have armed UPIDs (with notification interrupts enabled) for running tasks, and hot-unplugged CPUs don’t have running tasks.  We do need a way to drain pending IPIs before we offline a CPU, but that’s a separate problem and may be unsolvable for all I know. Is there a magic APIC operation to wait until all initiated IPIs targeting the local CPU arrive?  I guess we can also just mask the notification vector so that it won’t crash us if we get a stale IPI after going offline.

>
> Thanks,
>
>         tglx
Thomas Gleixner Oct. 1, 2021, 12:01 a.m. UTC | #12
On Thu, Sep 30 2021 at 15:01, Andy Lutomirski wrote:
> On Thu, Sep 30, 2021, at 12:29 PM, Thomas Gleixner wrote:
>>
>> But even with that we still need to keep track of the armed ones per CPU
>> so we can handle CPU hotunplug correctly. Sigh...
>
> I don’t think any real work is needed. We will only ever have armed
> UPIDs (with notification interrupts enabled) for running tasks, and
> hot-unplugged CPUs don’t have running tasks.

That's not the problem. The problem is the wait for uintr case where the
task is obviously not running:

CPU 1
     upid = T1->upid;
     upid->vector = UINTR_WAIT_VECTOR;
     upid->ndst = local_apic_id();
     ...
     do {
         ....
         schedule();
     }

CPU 0
    unplug CPU 1

    SENDUIPI(index)
        // Hardware does:
        tblentry = &ttable[index];
        upid = tblentry->upid;
        upid->pir |= tblentry->uv;
        send_IPI(upid->vector, upid->ndst);

So SENDUIPI will send the IPI to the APIC ID provided by T1->upid.ndst
which points to the offlined CPU 1 and therefore is obviously going to
/dev/null. IOW, lost wakeup...

> We do need a way to drain pending IPIs before we offline a CPU, but
> that’s a separate problem and may be unsolvable for all I know. Is
> there a magic APIC operation to wait until all initiated IPIs
> targeting the local CPU arrive?  I guess we can also just mask the
> notification vector so that it won’t crash us if we get a stale IPI
> after going offline.

All of this is solved already otherwise CPU hot unplug would explode in
your face every time. The software IPI send side is carefully
synchronized vs. hotplug (at least in theory). May I ask you politely to
make yourself familiar with all that before touting "We do need..." based
on random assumptions?

The above SENDUIPI vs. CPU hotplug scenario is the same problem as we
have with regular device interrupts which are targeted at an outgoing
CPU. We have magic mechanisms in place to handle that to the extent
possible, but due to the insanity of X86 interrupt handling mechanics
that still leaves a very tiny hole which might cause a lost and
subsequently stale interrupt. Nothing we can fix in software.

So on CPU offline the hotplug code walks through all device interrupts
and checks whether they are targeted at the outgoing CPU. If so they are
rerouted to an online CPU with lots of care to make the possible race
window as small as it gets. That's nowadays only a problem on systems
where interrupt remapping is not available or disabled via commandline.

For tasks which just have the user interrupt armed there is no problem
because SENDUIPI modifies UPID->PIR which is reevaluated when the task
which got migrated to an online CPU is going back to user space.

The uintr_wait() syscall creates the very same problem as we have with
device interrupts. Which means we need to make that wait thing:

     upid = T1->upid;
     upid->vector = UINTR_WAIT_VECTOR;
     upid->ndst = local_apic_id();
     list_add(this_cpu_ptr(pcp_uintrs), upid->pcp_uintr);
     ...
     do {
         ....
         schedule();
     }
     list_del_init(upid->pcp_uintr);

and the hotplug code do:

    for_each_entry_safe(upid, this_cpu_ptr(pcp_uintrs), ...) {
       list_del(upid->pcp_uintr);
       upid->ndst = apic_id_of_random_online_cpu();
       if (do_magic_checks_whether_ipi_is_pending())
         send_ipi(upid->vector, upid->ndst);
    }

See?

We could map that to the interrupt subsystem by creating a virtual
interrupt domain for this, but that would make uintr_wait() look like
this:

     irq = uintr_alloc_irq();
     request_irq(irq, ......);
     upid = T1->upid;
     upid->vector = UINTR_WAIT_VECTOR;
     upid->ndst = local_apic_id();
     list_add(this_cpu_ptr(pcp_uintrs), upid->pcp_uintr);
     ...
     do {
         ....
         schedule();
     }
     list_del_init(upid->pcp_uintr);
     free_irq(irq);

But the benefit of that is dubious as it creates overhead on both sides
of the sleep and the only real purpose of the irq request would be to
handle CPU hotunplug without the above per CPU list mechanics.

Welcome to my wonderful world!

Thanks,

        tglx
Andy Lutomirski Oct. 1, 2021, 4:41 a.m. UTC | #13
On Thu, Sep 30, 2021, at 5:01 PM, Thomas Gleixner wrote:
> On Thu, Sep 30 2021 at 15:01, Andy Lutomirski wrote:
>> On Thu, Sep 30, 2021, at 12:29 PM, Thomas Gleixner wrote:
>>>
>>> But even with that we still need to keep track of the armed ones per CPU
>>> so we can handle CPU hotunplug correctly. Sigh...
>>
>> I don’t think any real work is needed. We will only ever have armed
>> UPIDs (with notification interrupts enabled) for running tasks, and
>> hot-unplugged CPUs don’t have running tasks.
>
> That's not the problem. The problem is the wait for uintr case where the
> task is obviously not running:
>
> CPU 1
>      upid = T1->upid;
>      upid->vector = UINTR_WAIT_VECTOR;
>      upid->ndst = local_apic_id();
>      ...
>      do {
>          ....
>          schedule();
>      }
>
> CPU 0
>     unplug CPU 1
>
>     SENDUIPI(index)
>         // Hardware does:
>         tblentry = &ttable[index];
>         upid = tblentry->upid;
>         upid->pir |= tblentry->uv;
>         send_IPI(upid->vector, upid->ndst);
>
> So SENDUIPI will send the IPI to the APIC ID provided by T1->upid.ndst
> which points to the offlined CPU 1 and therefore is obviously going to
> /dev/null. IOW, lost wakeup...

Yes, but I don't think this is how we should structure this.

CPU 1
 upid->vector = UINV;
 upid->ndst = local_apic_id()
 exit to usermode;
 return from usermode;
 ...

 schedule();
 fpu__save_crap [see below]:
   if (this task is waiting for a uintr) {
     upid->resv0 = 1;  /* arm #GP */
   } else {
     upid->sn = 1;
   }


>
>> We do need a way to drain pending IPIs before we offline a CPU, but
>> that’s a separate problem and may be unsolvable for all I know. Is
>> there a magic APIC operation to wait until all initiated IPIs
>> targeting the local CPU arrive?  I guess we can also just mask the
>> notification vector so that it won’t crash us if we get a stale IPI
>> after going offline.
>
> All of this is solved already otherwise CPU hot unplug would explode in
> your face every time. The software IPI send side is carefully
> synchronized vs. hotplug (at least in theory). May I ask you politely to
> make yourself familiar with all that before touting "We do need..." based
> on random assumptions?

I'm aware that the software send IPI side is synchronized against hotplug.  But SENDUIPI is not unless we're going to have the CPU offline code IPI every other CPU to make sure that their SENDUIPIs have completed -- we don't control the SENDUIPI code.

After reading the ISE docs again, I think it might be possible to use the ON bit to synchronize.  In the schedule-out path, if we discover that ON = 1, then there is an IPI in flight to us.  In theory, we could wait for it, although actually doing so could be a mess.  That's why I'm asking whether there's a way to tell the APIC to literally wait for all IPIs that are *already sent* to be delivered.

>
> The above SENDUIPI vs. CPU hotplug scenario is the same problem as we
> have with regular device interrupts which are targeted at an outgoing
> CPU. We have magic mechanisms in place to handle that to the extent
> possible, but due to the insanity of X86 interrupt handling mechanics
> that still leaves a very tiny hole which might cause a lost and
> subsequently stale interrupt. Nothing we can fix in software.
>
> So on CPU offline the hotplug code walks through all device interrupts
> and checks whether they are targeted at the outgoing CPU. If so they are
> rerouted to an online CPU with lots of care to make the possible race
> window as small as it gets. That's nowadays only a problem on systems
> where interrupt remapping is not available or disabled via commandline.
>
> For tasks which just have the user interrupt armed there is no problem
> because SENDUIPI modifies UPID->PIR which is reevaluated when the task
> which got migrated to an online CPU is going back to user space.
>
> The uintr_wait() syscall creates the very same problem as we have with
> device interrupts. Which means we need to make that wait thing:
>
>      upid = T1->upid;
>      upid->vector = UINTR_WAIT_VECTOR;

This is exactly what I'm suggesting we *don't* do.  Instead we set a reserved bit, we decode SENDUIPI in the #GP handler, and we emulate, in-kernel, the notification process for non-running tasks.
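(Roughly, the emulation for a receiver parked in uintr_wait() could look
like the sketch below -- hypothetical, and it assumes the #GP fixup has
already decoded SENDUIPI and looked up the target UPID through the sender's
UITT:)

	/* Hypothetical sketch: post the vector the way the ucode would and
	 * wake the receiver that is parked in uintr_wait().
	 */
	static void uintr_emulate_senduipi(struct uintr_upid_ctx *upid_ctx, u8 uvec)
	{
		struct uintr_upid *upid = upid_ctx->upid;

		set_bit(uvec, (unsigned long *)&upid->puir);

		/* Only one wakeup per outstanding notification */
		if (!test_and_set_bit(UPID_ON, (unsigned long *)&upid->nc.status))
			wake_up_process(upid_ctx->task);
	}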

Now that I read the docs some more, I'm seriously concerned about this XSAVE design.  XSAVES with UINTR is destructive -- it clears UINV.  If we actually use this, then the whole last_cpu "preserve the state in registers" optimization goes out the window.  So does anything that happens to assume that merely saving the state doesn't destroy it on respectable modern CPUs.  XRSTORS will #GP if you XRSTORS twice, which makes me nervous and would need a serious audit of our XRSTORS paths.

This is gross.

--Andy
Thomas Gleixner Oct. 1, 2021, 9:56 a.m. UTC | #14
On Thu, Sep 30 2021 at 21:41, Andy Lutomirski wrote:
> On Thu, Sep 30, 2021, at 5:01 PM, Thomas Gleixner wrote:
>> All of this is solved already otherwise CPU hot unplug would explode in
>> your face every time. The software IPI send side is carefully
>> synchronized vs. hotplug (at least in theory). May I ask you politely to
>> make yourself familiar with all that before touting "We do need..." based
>> on random assumptions?
>
> I'm aware that the software send IPI side is synchronized against
> hotplug.  But SENDUIPI is not unless we're going to have the CPU
> offline code IPI every other CPU to make sure that their SENDUIPIs
> have completed -- we don't control the SENDUIPI code.

That's correct, but on CPU hot unplug _all_ running tasks have been
migrated to an online CPU _before_ the APIC is turned off. So they all
went through schedule() which set the UPID->SN bit. That's obviously
racy, but that has to be handled in exit to user mode anyway because
that's not different from any other migration or preemption. So that's
_not_ a problem at all.

The problem only exists if we can't do the #GP trick for tasks which are
sitting in uintr_wait(). Because then we _have_ to be careful vs. a
concurrent SENDUIPI. But that'd not be any different from the problem
vs. device interrupts which we have today.

If we can use #GP then there is no problem at all and we avoid all the
nasty stuff vs. hotplug and avoid the list walk etc.

> After reading the ISE docs again, I think it might be possible to use
> the ON bit to synchronize.  In the schedule-out path, if we discover
> that ON = 1, then there is an IPI in flight to us.  In theory, we
> could wait for it, although actually doing so could be a mess.  That's
> why I'm asking whether there's a way to tell the APIC to literally
> wait for all IPIs that are *already sent* to be delivered.

You could busy poll with interrupts enabled, but that does not solve
anything. What guarantees that after APIC.IRR is clear no further IPI is
sent? Nothing at all. But again, that's not any different from device
interrupts and we already handle that today:

      cpu down()
      ...
      disable interrupts();
      for_each_interrupt_affine_to_cpu(irq) {
      	change_affinity_to_online_cpu(irq, new_target_cpu);
        // Did device send to the old vector?
        if (APIC.IRR & vector_bit(old_vector))
           send_IPI(new_target_cpu, new_vector);
      }

So for uintr_wait() w/o #GP we'd need to do:

      for_each_waiter_on_cpu(w) {
           move_waiter_to_new_target_cpu_wait_list(w);
           w->ndest = new_target_cpu;
           if (w->ON)
              send_IPI(new_target_cpu, UIWAIT_VECTOR);
      }

>> The uintr_wait() syscall creates the very same problem as we have with
>> device interrupts. Which means we need to make that wait thing:
>>
>>      upid = T1->upid;
>>      upid->vector = UINTR_WAIT_VECTOR;
>
> This is exactly what I'm suggesting we *don't* do.  Instead we set a
> reserved bit, we decode SENDUIPI in the #GP handler, and we emulate,
> in-kernel, the notification process for non-running tasks.

Yes, under the assumption that we can use #GP without breaking device
delivery.

> Now that I read the docs some more, I'm seriously concerned about this
> XSAVE design.  XSAVES with UINTR is destructive -- it clears UINV.  If
> we actually use this, then the whole last_cpu "preserve the state in
> registers" optimization goes out the window.  So does anything that
> happens to assume that merely saving the state doesn't destroy it on
>> respectable modern CPUs. XRSTORS will #GP if you XRSTORS twice, which
> makes me nervous and would need a serious audit of our XRSTORS paths.

I have no idea what you are fantasizing about. You can XRSTORS five
times in a row as long as your XSTATE memory image is correct.

If you don't want to use XSAVES to manage UINTR then you have to manually
fiddle with the MSRs and UIF in schedule() and return to user space.

Also keeping UINV alive when scheduling out creates a lifetime issue
vs. UPID:

CPU 0   CPU 1                   CPU2
        T1 -> schedule         // UPID is live in UINTR MSRs
        do_stuff_in_kernel()
        local_irq_disable();
                                SENDUIPI(T1 -> CPU1)
pull T1
T1 exits
free UPID

        local_irq_enable();
        ucode handles UINV -> UAF

Clearing UINV prevents the ucode from handling the IPI and fiddling with
UPID. The CPU will forward the IPI vector to the kernel which acks it
and does nothing else, i.e. it's a spurious interrupt.

Coming back to state preserving. All that needs to be done for a
situation where the rest of the XSTATE is live in the registers, i.e.
the T -> kthread -> T scheduling scenario, is to restore UINV on exit to
user mode and handle UPID.PIR which might contain newly set bits which
are obviously not yet in UPID.IRR. That can be done by MSR fiddling or
by issuing a self IPI on the UINV vector which will be handled in ucode
on the first user space instruction after return.
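(A minimal sketch of that self-IPI variant, using the MSR names from
elsewhere in this thread; misc_val is a placeholder for the rebuilt
IA32_UINTR_MISC value with UINV set:)

     /* Re-arm detection in the live MSR, then let ucode drain UPID.PIR
      * into UIRR on the first user space instruction after return.
      */
     wrmsrl(IA32_UINTR_MISC, misc_val);
     apic->send_IPI_self(UINTR_NOTIFICATION_VECTOR);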

When the FPU has to be restored then the state has to be updated in the
XSAVE memory image before doing XRSTORS.

Thanks,

        tglx
Andy Lutomirski Oct. 1, 2021, 3:13 p.m. UTC | #15
On Fri, Oct 1, 2021, at 2:56 AM, Thomas Gleixner wrote:
> On Thu, Sep 30 2021 at 21:41, Andy Lutomirski wrote:
>> On Thu, Sep 30, 2021, at 5:01 PM, Thomas Gleixner wrote:
>
>> Now that I read the docs some more, I'm seriously concerned about this
>> XSAVE design.  XSAVES with UINTR is destructive -- it clears UINV.  If
>> we actually use this, then the whole last_cpu "preserve the state in
>> registers" optimization goes out the window.  So does anything that
>> happens to assume that merely saving the state doesn't destroy it on
>> respectable modern CPUs. XRSTORS will #GP if you XRSTORS twice, which
>> makes me nervous and would need a serious audit of our XRSTORS paths.
>
> I have no idea what you are fantasizing about. You can XRSTORS five
> times in a row as long as your XSTATE memory image is correct.

I'm just reading TFM, which is some kind of dystopian fantasy.

11.8.2.4 XRSTORS

Before restoring the user-interrupt state component, XRSTORS verifies that UINV is 0. If it is not, XRSTORS
causes a general-protection fault (#GP) before loading any part of the user-interrupt state component. (UINV
is IA32_UINTR_MISC[39:32]; XRSTORS does not check the contents of the remainder of that MSR.)

So if UINV is set in the memory image and you XRSTORS five times in a row, the first one will work assuming UINV was zero.  The second one will #GP.  And:

11.8.2.3 XSAVES
After saving the user-interrupt state component, XSAVES clears UINV. (UINV is IA32_UINTR_MISC[39:32];
XSAVES does not modify the remainder of that MSR.)

So if we're running a UPID-enabled user task and we switch to a kernel thread, we do XSAVES and UINV is cleared.  Then we switch back to the same task and don't do XRSTORS (or otherwise write IA32_UINTR_MISC) and UINV is still clear.

And we had better clear UINV when running a kernel thread because the UPID might get freed or the kernel thread might do some CPL3 shenanigans (via EFI, perhaps? I don't know if any firmwares actually do this).

So all this seems to put UINV into the "independent" category of feature along with LBR.  And the 512-byte wastes from extra copies of the legacy area and the loss of the XMODIFIED optimization will just be collateral damage.
Sohil Mehta Oct. 1, 2021, 6:04 p.m. UTC | #16
On 10/1/2021 8:13 AM, Andy Lutomirski wrote:
>
> I'm just reading TFM, which is some kind of dystopian fantasy.
>
> 11.8.2.4 XRSTORS
>
> Before restoring the user-interrupt state component, XRSTORS verifies that UINV is 0. If it is not, XRSTORS
> causes a general-protection fault (#GP) before loading any part of the user-interrupt state component. (UINV
> is IA32_UINTR_MISC[39:32]; XRSTORS does not check the contents of the remainder of that MSR.)
>
> So if UINV is set in the memory image and you XRSTORS five times in a row, the first one will work assuming UINV was zero.  The second one will #GP.  And:
>
> 11.8.2.3 XSAVES
> After saving the user-interrupt state component, XSAVES clears UINV. (UINV is IA32_UINTR_MISC[39:32];
> XSAVES does not modify the remainder of that MSR.)
>
> So if we're running a UPID-enabled user task and we switch to a kernel thread, we do XSAVES and UINV is cleared.  Then we switch back to the same task and don't do XRSTORS (or otherwise write IA32_UINTR_MISC) and UINV is still clear.

Andy,

I am still catching up with the rest of the discussion but I wanted to 
provide some input here.

Have you had a chance to look at the discussion on this topic in patch 5?
https://lore.kernel.org/lkml/87bl4fcxz8.ffs@tglx/
The pseudo code Thomas provided and my comments on the same cover the 
above situation.

The UINV bits in the IA32_UINTR_MISC act as an on/off switch for 
detecting user interrupts (i.e. moving them from UPID.PIR to UIRR). When 
XSAVES saves UIRR into memory we want the switch to atomically turn off 
to stop detecting additional interrupts. When we restore the state back 
the hardware wants to be sure the switch is off before writing to UIRR. 
If not, the UIRR state could potentially be overwritten.

That's how I understand the XSAVES/XRSTORS behavior. I can confirm with 
the hardware architects if you want more details here.

Regarding the #GP trick proposal, I am planning to get some feedback 
from the hardware folks to see if any potential issues could arise.

I am on a pre-planned break next week. I apologize (in advance) for the 
delay in responding.

Thanks,
Sohil
Thomas Gleixner Oct. 1, 2021, 9:29 p.m. UTC | #17
On Fri, Oct 01 2021 at 08:13, Andy Lutomirski wrote:

> On Fri, Oct 1, 2021, at 2:56 AM, Thomas Gleixner wrote:
>> On Thu, Sep 30 2021 at 21:41, Andy Lutomirski wrote:
>>> On Thu, Sep 30, 2021, at 5:01 PM, Thomas Gleixner wrote:
>>
>>> Now that I read the docs some more, I'm seriously concerned about this
>>> XSAVE design.  XSAVES with UINTR is destructive -- it clears UINV.  If
>>> we actually use this, then the whole last_cpu "preserve the state in
>>> registers" optimization goes out the window.  So does anything that
>>> happens to assume that merely saving the state doesn't destroy it on
>>> respectable modern CPUs. XRSTORS will #GP if you XRSTORS twice, which
>>> makes me nervous and would need a serious audit of our XRSTORS paths.
>>
>> I have no idea what you are fantasizing about. You can XRSTORS five
>> times in a row as long as your XSTATE memory image is correct.
>
> I'm just reading TFM, which is some kind of dystopian fantasy.
>
> 11.8.2.4 XRSTORS
>
> Before restoring the user-interrupt state component, XRSTORS verifies
> that UINV is 0. If it is not, XRSTORS causes a general-protection
> fault (#GP) before loading any part of the user-interrupt state
> component. (UINV is IA32_UINTR_MISC[39:32]; XRSTORS does not check the
> contents of the remainder of that MSR.)

Duh. I was staring at the SDM and searching for a hint. Stupid me!

> So if UINV is set in the memory image and you XRSTORS five times in a
> row, the first one will work assuming UINV was zero.  The second one
> will #GP.

Yes. I can see what you mean now :)

> 11.8.2.3 XSAVES
> After saving the user-interrupt state component, XSAVES clears UINV. (UINV is IA32_UINTR_MISC[39:32];
> XSAVES does not modify the remainder of that MSR.)
>
> So if we're running a UPID-enabled user task and we switch to a kernel
> thread, we do XSAVES and UINV is cleared.  Then we switch back to the
> same task and don't do XRSTORS (or otherwise write IA32_UINTR_MISC)
> and UINV is still clear.

Yes, that has to be mopped up on the way to user space.

> And we had better clear UINV when running a kernel thread because the
> UPID might get freed or the kernel thread might do some CPL3
> shenanigans (via EFI, perhaps? I don't know if any firmwares actually
> do this).

Right. That's what happens already with the current pile.

> So all this seems to put UINV into the "independent" category of
> feature along with LBR.  And the 512-byte wastes from extra copies of
> the legacy area and the loss of the XMODIFIED optimization will just
> be collateral damage.

So we'd end up with two XSAVES on context switch. We can simply do:

        XSAVES();
        fpu.state.xstate.uintr.uinv = 0;

which allows to do as many XRSTORS in a row as we want. Only the final
one on the way to user space will have to restore the real vector if the
register state is not valid:

       if (fpu_state_valid()) {
            if (needs_uinv(current))
               wrmsrl(UINV, vector);
       } else {
            if (needs_uinv(current))
               fpu.state.xstate.uintr.uinv = vector;
            XRSTORS();
       }

Hmm?

Thanks,

        tglx
Sohil Mehta Oct. 1, 2021, 11 p.m. UTC | #18
On 10/1/2021 2:29 PM, Thomas Gleixner wrote:
> So we'd end up with two XSAVES on context switch. We can simply do:
>          XSAVES();
>          fpu.state.xstate.uintr.uinv = 0;


I am a bit confused. Do we need to set UINV to 0 explicitly?

If XSAVES gets called twice during context switch then the UINV in the 
XSTATE buffer automatically gets set to 0. Since XSAVES saves the 
current UINV value in the MISC_MSR which was already set to 0 by the 
previous XSAVES.

Though, this probably happens due to pure luck rather than intentional design :)

> which allows to do as many XRSTORS in a row as we want. Only the final
> one on the way to user space will have to restore the real vector if the
> register state is not valid:
>
>         if (fpu_state_valid()) {
>              if (needs_uinv(current))
>                 wrmsrl(UINV, vector);
>         } else {
>              if (needs_uinv(current))
>                 fpu.state.xstate.uintr.uinv = vector;
>              XRSTORS();
>         }

I might have missed some subtle difference. Has this logic changed from 
what you previously suggested for arch_exit_to_user_mode_prepare()?

        if (xrstors_pending) {
                // Update the saved xstate for xrstors
                // Unconditionally update the UINV since it could have been
                // overwritten by calling XSAVES twice.
                current->xstate.uintr.uinv = UINTR_NOTIFICATION_VECTOR;
                current->xstate.uintr.uirr |= pir;
        } else {
                // Manually restore UIRR and UINV
                rdmsrl(IA32_UINTR_RR, uirr);
                wrmsrl(IA32_UINTR_RR, uirr | pir);

                misc.val64 = 0;
                misc.uittsz = current->uintr->uittsz;
                misc.uinv = UINTR_NOTIFICATION_VECTOR;
                wrmsrl(IA32_UINTR_MISC, misc.val64);
        }

> Hmm?


The one case I can see this failing is if there was another XRSTORS 
after the "final" restore in arch_exit_to_user_mode_prepare()? I think 
that is not possible but I am not an expert on this. Did I misunderstand 
something?

Thanks,
Sohil
Andy Lutomirski Oct. 1, 2021, 11:04 p.m. UTC | #19
> On Oct 1, 2021, at 2:29 PM, Thomas Gleixner <tglx@linutronix.de> wrote:
> 
> On Fri, Oct 01 2021 at 08:13, Andy Lutomirski wrote:
> 
>>> On Fri, Oct 1, 2021, at 2:56 AM, Thomas Gleixner wrote:
>>> On Thu, Sep 30 2021 at 21:41, Andy Lutomirski wrote:
>>>>> On Thu, Sep 30, 2021, at 5:01 PM, Thomas Gleixner wrote:
>>> 
>>>> Now that I read the docs some more, I'm seriously concerned about this
>>>> XSAVE design.  XSAVES with UINTR is destructive -- it clears UINV.  If
>>>> we actually use this, then the whole last_cpu "preserve the state in
>>>> registers" optimization goes out the window.  So does anything that
>>>> happens to assume that merely saving the state doesn't destroy it on
>>>> respectable modern CPUs. XRSTORS will #GP if you XRSTORS twice, which
>>>> makes me nervous and would need a serious audit of our XRSTORS paths.
>>> 
>>> I have no idea what you are fantasizing about. You can XRSTORS five
>>> times in a row as long as your XSTATE memory image is correct.
>> 
>> I'm just reading TFM, which is some kind of dystopian fantasy.
>> 
>> 11.8.2.4 XRSTORS
>> 
>> Before restoring the user-interrupt state component, XRSTORS verifies
>> that UINV is 0. If it is not, XRSTORS causes a general-protection
>> fault (#GP) before loading any part of the user-interrupt state
>> component. (UINV is IA32_UINTR_MISC[39:32]; XRSTORS does not check the
>> contents of the remainder of that MSR.)
> 
> Duh. I was staring at the SDM and searching for a hint. Stupid me!
> 
>> So if UINV is set in the memory image and you XRSTORS five times in a
>> row, the first one will work assuming UINV was zero.  The second one
>> will #GP.
> 
> Yes. I can see what you mean now :)
> 
>> 11.8.2.3 XSAVES
>> After saving the user-interrupt state component, XSAVES clears UINV. (UINV is IA32_UINTR_MISC[39:32];
>> XSAVES does not modify the remainder of that MSR.)
>> 
>> So if we're running a UPID-enabled user task and we switch to a kernel
>> thread, we do XSAVES and UINV is cleared.  Then we switch back to the
>> same task and don't do XRSTORS (or otherwise write IA32_UINTR_MISC)
>> and UINV is still clear.
> 
> Yes, that has to be mopped up on the way to user space.
> 
>> And we had better clear UINV when running a kernel thread because the
>> UPID might get freed or the kernel thread might do some CPL3
>> shenanigans (via EFI, perhaps? I don't know if any firmwares actually
>> do this).
> 
> Right. That's what happens already with the current pile.
> 
>> So all this seems to put UINV into the "independent" category of
>> feature along with LBR.  And the 512-byte wastes from extra copies of
>> the legacy area and the loss of the XMODIFIED optimization will just
>> be collateral damage.
> 
> So we'd end up with two XSAVES on context switch. We can simply do:
> 
>        XSAVES();
>        fpu.state.xstate.uintr.uinv = 0;

Could work. As long as UINV is armed, RR can change at any time (maybe just when IF=1? The manual is unclear).  But the first XSAVES disarms UINV, so maybe this won’t confuse any callers.

> 
> which allows to do as many XRSTORS in a row as we want. Only the final
> one on the way to user space will have to restore the real vector if the
> register state is not valid:
> 
>       if (fpu_state_valid()) {
>            if (needs_uinv(current))
>               wrmsrl(UINV, vector);
>       } else {
>            if (needs_uinv(current))
>               fpu.state.xstate.uintr.uinv = vector;
>            XRSTORS();
>       }
> 
> Hmm?

I like it better than anything else I’ve seen.

> 
> Thanks,
> 
>        tglx

Patch

diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 279afc01f1ac..a4623fdb65a1 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -22,6 +22,7 @@  typedef struct {
 #endif
 #ifdef CONFIG_X86_USER_INTERRUPTS
 	unsigned int uintr_spurious_count;
+	unsigned int uintr_kernel_notifications;
 #endif
 	unsigned int x86_platform_ipis;	/* arch dependent */
 	unsigned int apic_perf_irqs;
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 5929a6f9eeee..0ac7ef592283 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -673,6 +673,7 @@  DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR,	sysvec_kvm_posted_intr_nested
 
 #ifdef CONFIG_X86_USER_INTERRUPTS
 DECLARE_IDTENTRY_SYSVEC(UINTR_NOTIFICATION_VECTOR,	sysvec_uintr_spurious_interrupt);
+DECLARE_IDTENTRY_SYSVEC(UINTR_KERNEL_VECTOR,		sysvec_uintr_kernel_notification);
 #endif
 
 #if IS_ENABLED(CONFIG_HYPERV)
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index d26faa504931..1d289b3ee0da 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -106,8 +106,9 @@ 
 
 /* Vector for User interrupt notifications */
 #define UINTR_NOTIFICATION_VECTOR       0xec
+#define UINTR_KERNEL_VECTOR		0xeb
 
-#define LOCAL_TIMER_VECTOR		0xeb
+#define LOCAL_TIMER_VECTOR		0xea
 
 #define NR_VECTORS			 256
 
diff --git a/arch/x86/include/asm/uintr.h b/arch/x86/include/asm/uintr.h
index ef3521dd7fb9..64113ef523ca 100644
--- a/arch/x86/include/asm/uintr.h
+++ b/arch/x86/include/asm/uintr.h
@@ -4,11 +4,29 @@ 
 
 #ifdef CONFIG_X86_USER_INTERRUPTS
 
+/* User Posted Interrupt Descriptor (UPID) */
+struct uintr_upid {
+	struct {
+		u8 status;	/* bit 0: ON, bit 1: SN, bit 2-7: reserved */
+		u8 reserved1;	/* Reserved */
+		u8 nv;		/* Notification vector */
+		u8 reserved2;	/* Reserved */
+		u32 ndst;	/* Notification destination */
+	} nc __packed;		/* Notification control */
+	u64 puir;		/* Posted user interrupt requests */
+} __aligned(64);
+
+/* UPID Notification control status */
+#define UPID_ON		0x0	/* Outstanding notification */
+#define UPID_SN		0x1	/* Suppressed notification */
+
 struct uintr_upid_ctx {
+	struct list_head node;
 	struct task_struct *task;	/* Receiver task */
 	struct uintr_upid *upid;
 	refcount_t refs;
 	bool receiver_active;		/* Flag for UPID being mapped to a receiver */
+	bool waiting;
 };
 
 struct uintr_receiver_info {
@@ -43,11 +61,15 @@  void uintr_free(struct task_struct *task);
 void switch_uintr_prepare(struct task_struct *prev);
 void switch_uintr_return(void);
 
+int uintr_receiver_wait(void);
+void uintr_wake_up_process(void);
+
 #else /* !CONFIG_X86_USER_INTERRUPTS */
 
 static inline void uintr_free(struct task_struct *task) {}
 static inline void switch_uintr_prepare(struct task_struct *prev) {}
 static inline void switch_uintr_return(void) {}
+static inline void uintr_wake_up_process(void) {}
 
 #endif /* CONFIG_X86_USER_INTERRUPTS */
 
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index d8c45e0728f0..8d4fd7509523 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -149,6 +149,7 @@  static const __initconst struct idt_data apic_idts[] = {
 # endif
 #ifdef CONFIG_X86_USER_INTERRUPTS
 	INTG(UINTR_NOTIFICATION_VECTOR,		asm_sysvec_uintr_spurious_interrupt),
+	INTG(UINTR_KERNEL_VECTOR,		asm_sysvec_uintr_kernel_notification),
 #endif
 # ifdef CONFIG_IRQ_WORK
 	INTG(IRQ_WORK_VECTOR,			asm_sysvec_irq_work),
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index e3c35668c7c5..22349f5c301b 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -22,6 +22,7 @@ 
 #include <asm/desc.h>
 #include <asm/traps.h>
 #include <asm/thermal.h>
+#include <asm/uintr.h>
 
 #define CREATE_TRACE_POINTS
 #include <asm/trace/irq_vectors.h>
@@ -187,6 +188,11 @@  int arch_show_interrupts(struct seq_file *p, int prec)
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->uintr_spurious_count);
 	seq_puts(p, "  User-interrupt spurious event\n");
+
+	seq_printf(p, "%*s: ", prec, "UKN");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->uintr_kernel_notifications);
+	seq_puts(p, "  User-interrupt kernel notification event\n");
 #endif
 	return 0;
 }
@@ -356,6 +362,18 @@  DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_uintr_spurious_interrupt)
 	ack_APIC_irq();
 	inc_irq_stat(uintr_spurious_count);
 }
+
+/*
+ * Handler for UINTR_KERNEL_VECTOR.
+ */
+DEFINE_IDTENTRY_SYSVEC(sysvec_uintr_kernel_notification)
+{
+	/* TODO: Add entry-exit tracepoints */
+	ack_APIC_irq();
+	inc_irq_stat(uintr_kernel_notifications);
+
+	uintr_wake_up_process();
+}
 #endif
 
 
diff --git a/arch/x86/kernel/uintr_core.c b/arch/x86/kernel/uintr_core.c
index 8f331c5fe0cf..4e5545e6d903 100644
--- a/arch/x86/kernel/uintr_core.c
+++ b/arch/x86/kernel/uintr_core.c
@@ -28,22 +28,6 @@ 
 #define UINTR_MAX_UITT_NR 256
 #define UINTR_MAX_UVEC_NR 64
 
-/* User Posted Interrupt Descriptor (UPID) */
-struct uintr_upid {
-	struct {
-		u8 status;	/* bit 0: ON, bit 1: SN, bit 2-7: reserved */
-		u8 reserved1;	/* Reserved */
-		u8 nv;		/* Notification vector */
-		u8 reserved2;	/* Reserved */
-		u32 ndst;	/* Notification destination */
-	} nc __packed;		/* Notification control */
-	u64 puir;		/* Posted user interrupt requests */
-} __aligned(64);
-
-/* UPID Notification control status */
-#define UPID_ON		0x0	/* Outstanding notification */
-#define UPID_SN		0x1	/* Suppressed notification */
-
 struct uintr_receiver {
 	struct uintr_upid_ctx *upid_ctx;
 	u64 uvec_mask;	/* track active vector per bit */
@@ -70,6 +54,10 @@  struct uintr_sender {
 	u64 uitt_mask[BITS_TO_U64(UINTR_MAX_UITT_NR)];
 };
 
+/* TODO: To remove the global lock, move to a per-cpu wait list. */
+static DEFINE_SPINLOCK(uintr_wait_lock);
+static struct list_head uintr_wait_list = LIST_HEAD_INIT(uintr_wait_list);
+
 inline bool uintr_arch_enabled(void)
 {
 	return static_cpu_has(X86_FEATURE_UINTR);
@@ -80,6 +68,12 @@  static inline bool is_uintr_receiver(struct task_struct *t)
 	return !!t->thread.ui_recv;
 }
 
+/* The caller must check is_uintr_receiver() on the task before calling this */
+static inline bool is_uintr_waiting(struct task_struct *t)
+{
+	return t->thread.ui_recv->upid_ctx->waiting;
+}
+
 static inline bool is_uintr_sender(struct task_struct *t)
 {
 	return !!t->thread.ui_send;
@@ -151,6 +145,7 @@  static struct uintr_upid_ctx *alloc_upid(void)
 	refcount_set(&upid_ctx->refs, 1);
 	upid_ctx->task = get_task_struct(current);
 	upid_ctx->receiver_active = true;
+	upid_ctx->waiting = false;
 
 	return upid_ctx;
 }
@@ -494,6 +489,68 @@  int do_uintr_register_sender(struct uintr_receiver_info *r_info,
 	return 0;
 }
 
+int uintr_receiver_wait(void)
+{
+	struct uintr_upid_ctx *upid_ctx;
+	unsigned long flags;
+
+	if (!is_uintr_receiver(current))
+		return -EOPNOTSUPP;
+
+	upid_ctx = current->thread.ui_recv->upid_ctx;
+	upid_ctx->upid->nc.nv = UINTR_KERNEL_VECTOR;
+	upid_ctx->waiting = true;
+	spin_lock_irqsave(&uintr_wait_lock, flags);
+	list_add(&upid_ctx->node, &uintr_wait_list);
+	spin_unlock_irqrestore(&uintr_wait_lock, flags);
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	schedule();
+
+	return -EINTR;
+}
+
+/*
+ * Runs in interrupt context.
+ * Scan through all UPIDs to check if any interrupt is ongoing.
+ */
+void uintr_wake_up_process(void)
+{
+	struct uintr_upid_ctx *upid_ctx, *tmp;
+	unsigned long flags;
+
+	spin_lock_irqsave(&uintr_wait_lock, flags);
+	list_for_each_entry_safe(upid_ctx, tmp, &uintr_wait_list, node) {
+		if (test_bit(UPID_ON, (unsigned long *)&upid_ctx->upid->nc.status)) {
+			set_bit(UPID_SN, (unsigned long *)&upid_ctx->upid->nc.status);
+			upid_ctx->upid->nc.nv = UINTR_NOTIFICATION_VECTOR;
+			upid_ctx->waiting = false;
+			wake_up_process(upid_ctx->task);
+			list_del(&upid_ctx->node);
+		}
+	}
+	spin_unlock_irqrestore(&uintr_wait_lock, flags);
+}
+
+/* Called when task is unregistering/exiting */
+static void uintr_remove_task_wait(struct task_struct *task)
+{
+	struct uintr_upid_ctx *upid_ctx, *tmp;
+	unsigned long flags;
+
+	spin_lock_irqsave(&uintr_wait_lock, flags);
+	list_for_each_entry_safe(upid_ctx, tmp, &uintr_wait_list, node) {
+		if (upid_ctx->task == task) {
+			pr_debug("wait: Removing task %d from wait\n",
+				 upid_ctx->task->pid);
+			upid_ctx->upid->nc.nv = UINTR_NOTIFICATION_VECTOR;
+			upid_ctx->waiting = false;
+			list_del(&upid_ctx->node);
+		}
+	}
+	spin_unlock_irqrestore(&uintr_wait_lock, flags);
+}
+
 int do_uintr_unregister_handler(void)
 {
 	struct task_struct *t = current;
@@ -548,7 +605,7 @@  int do_uintr_unregister_handler(void)
 	 * based on this UPID.
 	 */
 	set_bit(UPID_SN, (unsigned long *)&ui_recv->upid_ctx->upid->nc.status);
-
+	uintr_remove_task_wait(t);
 	put_upid_ref(ui_recv->upid_ctx);
 	kfree(ui_recv);
 	t->thread.ui_recv = NULL;
@@ -684,7 +741,7 @@  void switch_uintr_prepare(struct task_struct *prev)
 {
 	struct uintr_upid *upid;
 
-	if (is_uintr_receiver(prev)) {
+	if (is_uintr_receiver(prev) && !is_uintr_waiting(prev)) {
 		upid = prev->thread.ui_recv->upid_ctx->upid;
 		set_bit(UPID_SN, (unsigned long *)&upid->nc.status);
 	}
@@ -806,6 +863,7 @@  void uintr_free(struct task_struct *t)
 		 * generated based on this UPID.
 		 */
 		set_bit(UPID_SN, (unsigned long *)&ui_recv->upid_ctx->upid->nc.status);
+		uintr_remove_task_wait(t);
 		ui_recv->upid_ctx->receiver_active = false;
 		put_upid_ref(ui_recv->upid_ctx);
 		kfree(ui_recv);
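A side note on the blocking path added above: uintr_receiver_wait() pairs a bare set_current_state(TASK_INTERRUPTIBLE) with an immediate schedule(), and when the sleep is interrupted by a signal nothing takes the entry back off uintr_wait_list until unregister/exit. Below is a minimal sketch, not part of the submitted patch, of the same wait written with the usual set-state / re-check-the-condition loop; it assumes, as in the hunks above, that uintr_wake_up_process() clears upid_ctx->waiting before waking the task.

/*
 * Sketch only, not part of this patch: the blocking side expressed
 * with the standard prepare-to-wait pattern, so a wakeup that lands
 * between list_add() and schedule() is not lost.
 */
static int sketch_uintr_wait(struct uintr_upid_ctx *upid_ctx)
{
	unsigned long flags;
	int ret = 0;

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!READ_ONCE(upid_ctx->waiting))	/* cleared by the wake path */
			break;
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}
		schedule();
	}
	__set_current_state(TASK_RUNNING);

	/* Interrupted by a signal: the entry is still queued, unhook it. */
	if (ret == -EINTR) {
		spin_lock_irqsave(&uintr_wait_lock, flags);
		if (upid_ctx->waiting) {
			upid_ctx->upid->nc.nv = UINTR_NOTIFICATION_VECTOR;
			upid_ctx->waiting = false;
			list_del(&upid_ctx->node);
		}
		spin_unlock_irqrestore(&uintr_wait_lock, flags);
	}

	return ret;
}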
diff --git a/arch/x86/kernel/uintr_fd.c b/arch/x86/kernel/uintr_fd.c
index 3c82c032c0b9..a7e55d98c0c7 100644
--- a/arch/x86/kernel/uintr_fd.c
+++ b/arch/x86/kernel/uintr_fd.c
@@ -283,3 +283,18 @@  SYSCALL_DEFINE2(uintr_unregister_sender, int, uintrfd, unsigned int, flags)
 	fdput(f);
 	return ret;
 }
+
+/*
+ * sys_uintr_wait - Wait for a user interrupt
+ */
+SYSCALL_DEFINE1(uintr_wait, unsigned int, flags)
+{
+	if (!uintr_arch_enabled())
+		return -EOPNOTSUPP;
+
+	if (flags)
+		return -EINVAL;
+
+	/* TODO: Add a timeout option */
+	return uintr_receiver_wait();
+}
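For completeness, a hypothetical user-space caller of the new syscall. The syscall number below is a placeholder (the real number is assigned in the syscall-table patch of this series), and registration of the handler and vectors via the other uintr_* syscalls is assumed to have happened already and is omitted.

/*
 * Hypothetical sketch of blocking in uintr_wait() from user space.
 * __NR_uintr_wait is a placeholder number.
 */
#include <errno.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_uintr_wait
#define __NR_uintr_wait		476	/* placeholder */
#endif

int main(void)
{
	/* flags must currently be 0; anything else returns -EINVAL. */
	long ret = syscall(__NR_uintr_wait, 0);

	/*
	 * The RFC implementation always returns -EINTR once the task is
	 * woken, whether by a SENDUIPI-triggered kernel notification or
	 * by an ordinary signal.
	 */
	if (ret == -1 && errno == EINTR)
		printf("uintr_wait() returned: woken up\n");
	else
		perror("uintr_wait");

	return 0;
}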