
[v3,11/11] KVM: nVMX: Wake L2 from HLT when nested posted-interrupt pending

Message ID 1514131983-24305-12-git-send-email-liran.alon@oracle.com (mailing list archive)
State New, archived

Commit Message

Liran Alon Dec. 24, 2017, 4:13 p.m. UTC
If L1 doesn't intercept L2 HLT (doesn't set CPU_BASED_HLT_EXITING),
then when L2 executes the HLT instruction, KVM will block the vCPU from
further execution (just like what happens when L1 executes HLT).

Consider the case where vmx_deliver_nested_posted_interrupt() sees
that vcpu->mode == IN_GUEST_MODE and therefore sends a physical IPI.
Assume that before it sends the physical IPI, the dest CPU running
L2 executes HLT, which causes the dest vCPU to be blocked.

Before the dest vCPU is blocked, a call to kvm_arch_vcpu_runnable()
is made, which calls kvm_vcpu_has_events(). In addition, after the
vCPU is blocked, every time it is interrupted (by an IPI),
kvm_vcpu_check_block() is called, which also calls
kvm_arch_vcpu_runnable().
kvm_vcpu_has_events() should return true if there is a pending event
for the vCPU that should be processed, in which case the vCPU should
be unblocked.

In order to handle the above-mentioned case, kvm_vcpu_has_events()
should check whether there is a pending nested posted-interrupt on the
vCPU that should be dispatched to the guest.

Fixes: 705699a13994 ("KVM: nVMX: Enable nested posted interrupt processing")

Signed-off-by: Liran Alon <liran.alon@oracle.com>
Reviewed-by: Nikita Leshenko <nikita.leshchenko@oracle.com>
Reviewed-by: Liam Merwick <liam.merwick@oracle.com>
Signed-off-by: Liam Merwick <liam.merwick@oracle.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/svm.c              |  7 +++++++
 arch/x86/kvm/vmx.c              | 13 +++++++++++++
 arch/x86/kvm/x86.c              |  3 ++-
 4 files changed, 23 insertions(+), 1 deletion(-)

Comments

Paolo Bonzini Dec. 27, 2017, 10:15 a.m. UTC | #1
On 24/12/2017 17:13, Liran Alon wrote:
> +static bool vmx_cpu_has_nested_posted_interrupt(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	return (vcpu->arch.apicv_active &&
> +		is_guest_mode(vcpu) &&
> +		vmx->nested.pi_pending &&
> +		vmx->nested.pi_desc &&
> +		pi_test_on(vmx->nested.pi_desc));
> +}
> +
>  /*
>   * Set up the vmcs's constant host-state fields, i.e., host-state fields that
>   * will not change in the lifetime of the guest.
> @@ -12142,6 +12153,8 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
>  	.deliver_posted_interrupt = vmx_deliver_posted_interrupt,
>  	.complete_nested_posted_interrupt =
>  		vmx_complete_nested_posted_interrupt,
> +	.cpu_has_nested_posted_interrupt =
> +		vmx_cpu_has_nested_posted_interrupt,
>  
>  	.set_tss_addr = vmx_set_tss_addr,
>  	.get_tdp_level = get_ept_level,
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index fa088951afc9..a840f2c9bd66 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -8542,7 +8542,8 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
>  		return true;
>  
>  	if (kvm_arch_interrupt_allowed(vcpu) &&
> -	    kvm_cpu_has_interrupt(vcpu))
> +	    (kvm_cpu_has_interrupt(vcpu) ||
> +	     kvm_x86_ops->cpu_has_nested_posted_interrupt(vcpu)))
>  		return true;


kvm_cpu_has_interrupt ultimately calls apic_has_interrupt_for_ppr, which 
calls kvm_x86_ops->sync_pir_to_irr.

You already have

+		if (is_guest_mode(vcpu))
+			kvm_x86_ops->complete_nested_posted_interrupt(vcpu);

earlier in the series right after a call to kvm_x86_ops->sync_pir_to_irr.

So I wonder if:

1) kvm_x86_ops->complete_nested_posted_interrupt would do the job here as
well, removing the need for the new kvm_x86_ops member;

2) The call to kvm_x86_ops->complete_nested_posted_interrupt actually
applies to all callers of sync_pir_to_irr, which would remove the need for
that other kvm_x86_ops member.

I'm leaning towards applying patches 1-4, what do you think?

Paolo
Liran Alon Dec. 27, 2017, 10:51 a.m. UTC | #2
On 27/12/17 12:15, Paolo Bonzini wrote:
> On 24/12/2017 17:13, Liran Alon wrote:
>> +static bool vmx_cpu_has_nested_posted_interrupt(struct kvm_vcpu *vcpu)
>> +{
>> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +
>> +	return (vcpu->arch.apicv_active &&
>> +		is_guest_mode(vcpu) &&
>> +		vmx->nested.pi_pending &&
>> +		vmx->nested.pi_desc &&
>> +		pi_test_on(vmx->nested.pi_desc));
>> +}
>> +
>>   /*
>>    * Set up the vmcs's constant host-state fields, i.e., host-state fields that
>>    * will not change in the lifetime of the guest.
>> @@ -12142,6 +12153,8 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
>>   	.deliver_posted_interrupt = vmx_deliver_posted_interrupt,
>>   	.complete_nested_posted_interrupt =
>>   		vmx_complete_nested_posted_interrupt,
>> +	.cpu_has_nested_posted_interrupt =
>> +		vmx_cpu_has_nested_posted_interrupt,
>>
>>   	.set_tss_addr = vmx_set_tss_addr,
>>   	.get_tdp_level = get_ept_level,
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index fa088951afc9..a840f2c9bd66 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -8542,7 +8542,8 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
>>   		return true;
>>
>>   	if (kvm_arch_interrupt_allowed(vcpu) &&
>> -	    kvm_cpu_has_interrupt(vcpu))
>> +	    (kvm_cpu_has_interrupt(vcpu) ||
>> +	     kvm_x86_ops->cpu_has_nested_posted_interrupt(vcpu)))
>>   		return true;
>
>
> kvm_cpu_has_interrupt ultimately calls apic_has_interrupt_for_ppr, which
> calls kvm_x86_ops->sync_pir_to_irr.
>
> You already have
>
> +		if (is_guest_mode(vcpu))
> +			kvm_x86_ops->complete_nested_posted_interrupt(vcpu);
>
> earlier in the series right after a call to kvm_x86_ops->sync_pir_to_irr.
>
> So I wonder if:
>
> 1) kvm_x86_ops->complete_nested_posted_interrupt would do the job here as
> well, removing the need for the new kvm_x86_ops member;
>
> 2) The call to kvm_x86_ops->complete_nested_posted_interrupt actually
> applies to all callers of sync_pir_to_irr, which would remove the need for
> that other kvm_x86_ops member.

Maybe I misunderstand you, but I don't think this is true.
complete_nested_posted_interrupt() relies on being called at a very
specific call-site: right before VMEntry and after interrupts are
disabled. It works by issuing a self-IPI to cause the CPU to actually
process the nested posted-interrupts.
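
A minimal sketch of that call-site constraint, reconstructed from the
descriptions in this thread (the exact body in the v3 series may differ):

static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Runs with interrupts disabled, immediately before VM-entry. */
	if (!is_guest_mode(vcpu) || !vmx->nested.pi_pending)
		return;

	vmx->nested.pi_pending = false;
	/*
	 * The self-IPI makes the CPU itself perform posted-interrupt
	 * processing on the L2 PI descriptor once it is back in
	 * non-root mode; skip it if the ON bit is clear.
	 */
	if (vmx->nested.pi_desc && pi_test_on(vmx->nested.pi_desc))
		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
				    POSTED_INTR_NESTED_VECTOR);
}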

In addition, I don't see how we can utilize the fact that
kvm_cpu_has_interrupt() calls apic_has_interrupt_for_ppr(), as
vmx.nested.pi_desc->pir should never be synced into the L0 vLAPIC IRR,
in contrast to vmx.pi_desc->pir, which is (after sync_pir_to_irr()).

>
> I'm leaning towards applying patches 1-4, what do you think?
>

I don't see any reason not to do so if it passes your review :)

Logically, these patches are separate from the patches we are still debating.

Regards,
-Liran

> Paolo
>
Paolo Bonzini Dec. 27, 2017, 12:55 p.m. UTC | #3
On 27/12/2017 11:51, Liran Alon wrote:
> Maybe I misunderstand you, but I don't think this is true.
> complete_nested_posted_interrupt() relies on being called at a very
> specific call-site: right before VMEntry and after interrupts are
> disabled. It works by issuing a self-IPI to cause the CPU to actually
> process the nested posted-interrupts.
>
> In addition, I don't see how we can utilize the fact that
> kvm_cpu_has_interrupt() calls apic_has_interrupt_for_ppr(), as
> vmx.nested.pi_desc->pir should never be synced into the L0 vLAPIC IRR,
> in contrast to vmx.pi_desc->pir, which is (after sync_pir_to_irr()).

You're right, the callbacks must stay.  What confused me is the
reference to pi_test_on in vmx_cpu_has_nested_posted_interrupt.  While
your patches are an improvement, there is still some confusion on what
leads to injecting a nested posted interrupt, and on the split between
"deliver" and "complete":

- checking pi_test_on should only be done when consuming the L2 PI
descriptor (which in your case is done by the processor via the
self-IPI).  So you don't need to test it before sending the self-IPI.

- vmx->nested.pi_pending should not be set in
vmx_deliver_nested_posted_interrupt after patch 9.  pi_pending should
only be set by the nested PI handler, when it is invoked outside guest
mode.  It is then consumed before doing the self-IPI as in the above sketch.

- likewise, vmx_cpu_has_nested_posted_interrupt should only check
vmx->nested.pi_pending.  I don't think it should even check
is_guest_mode(vcpu) or vmx->nested.pi_desc; compare with the handling of
KVM_REQ_NMI, it doesn't test nmi_allowed.  This leads me to the next point.

- vmx_complete_nested_posted_interrupt *must* find a valid descriptor.
Even after a vmexit, because vmx_complete_nested_posted_interrupt is the
very first thing that runs and L1 has had no time to run a vmwrite.  So
the above could become:

	if (is_guest_mode(vcpu) && vmx->nested.pi_pending) {
		vmx->nested.pi_pending = false;
		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
				    POSTED_INTR_NESTED_VECTOR);
	}

and then what about that is_guest_mode check?  When L1 sent the PI
notification, it thought that the destination VCPU was in guest mode.
Upon vmexit, L1 can expect to either see ON=0 or get the notification
interrupt itself.  So either we deliver the interrupt to L1, or we must
do L2 virtual interrupt delivery before L1 even runs.  Hence IMO the
is_guest_mode is wrong, which unfortunately throws a wrench into
the self-IPI idea.

I agree with moving vmx_complete_nested_posted_interrupt out of
check_nested_events, and also with the simplification of
vmx_hwapic_irr_update.  However, in my opinion the self-IPI is actually
complicating things.  Once the handling of vmx->nested.pi_pending is
cleaned up (the nested PI interrupt handler in patch 9 is a great
improvement in that respect), we can move
vmx_complete_nested_posted_interrupt to a new KVM_REQ_NESTED_PI request
that replaces vmx->nested.pi_pending.  The GUEST_INTR_STATUS write can
be done in the vmcs12, and will then be copied by prepare_vmcs02 to the
vmcs02.

I may be wrong, but based on my experience cleaning up the bare-metal
APICv, I suspect that both the current code and this version are overly
complicated, and there is a nice and simple core waiting to be
extracted.  Let's fix the race condition in the simple way, and
separately focus on cleaning up pi_pending and everything that follows
from there.

Paolo
Liran Alon Dec. 27, 2017, 3:15 p.m. UTC | #4
On 27/12/17 14:55, Paolo Bonzini wrote:
> On 27/12/2017 11:51, Liran Alon wrote:
>> Maybe I misunderstand you, but I don't think this is true.
>> complete_nested_posted_interrupt() relies on being called at a very
>> specific call-site: right before VMEntry and after interrupts are
>> disabled. It works by issuing a self-IPI to cause the CPU to actually
>> process the nested posted-interrupts.
>>
>> In addition, I don't see how we can utilize the fact that
>> kvm_cpu_has_interrupt() calls apic_has_interrupt_for_ppr(), as
>> vmx.nested.pi_desc->pir should never be synced into the L0 vLAPIC IRR,
>> in contrast to vmx.pi_desc->pir, which is (after sync_pir_to_irr()).
>
> You're right, the callbacks must stay.  What confused me is the
> reference to pi_test_on in vmx_cpu_has_nested_posted_interrupt.  While
> your patches are an improvement, there is still some confusion on what
> leads to injecting a nested posted interrupt, and on the split between
> "deliver" and "complete":
>
> - checking pi_test_on should only be done when consuming the L2 PI
> descriptor (which in your case is done by the processor via the
> self-IPI).  So you don't need to test it before sending the self-IPI.

I agree we don't strictly need to check it. It is done as an optimization
to avoid sending a self-IPI when it won't do anything because the ON bit
in vmx->nested.pi_desc->control is off.

This optimization is even more important when this code runs in L1
(think about triple-virtualization ^_^), to avoid a #VMExit on the write
to the LAPIC ICR.

>
> - vmx->nested.pi_pending should not be set in
> vmx_deliver_nested_posted_interrupt after patch 9.  pi_pending should
> only be set by the nested PI handler, when it is invoked outside guest
> mode.  It is then consumed before doing the self-IPI as in the above sketch.

I disagree.
vmx->nested.pi_pending is used as a signal that L1 has indeed sent a
vmcs12->posted_intr_nv IPI. If vmx_deliver_nested_posted_interrupt()
sees that the target vCPU thread is OUTSIDE_GUEST_MODE, it won't send a
physical IPI but will instead only kick the vCPU. In this case, the only
way the target vCPU thread knows it should trigger the self-IPI is the
fact that pi_pending was set by vmx_deliver_nested_posted_interrupt()
(as in this case, the nested PI handler in the host was never invoked).

(It cannot rely only on the ON bit in vmx->nested.pi_desc->control, as
that may be set by L1 without L1 triggering a vmcs12->posted_intr_nv
IPI.)
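
A hedged sketch of the delivery path as described here (assuming the
series' function shape; details may differ from the actual patch):

static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
					       int vector)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (is_guest_mode(vcpu) &&
	    vector == vmx->nested.posted_intr_nv) {
		/* Record that L1 sent its notification-vector IPI. */
		vmx->nested.pi_pending = true;
		kvm_make_request(KVM_REQ_EVENT, vcpu);
		/*
		 * If the target is IN_GUEST_MODE, the physical IPI makes
		 * hardware process vmcs12's PI descriptor directly;
		 * otherwise only kick the vCPU, and pi_pending tells it
		 * to issue the self-IPI on the next VM-entry.
		 */
		if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
			kvm_vcpu_kick(vcpu);
		return 0;
	}
	return -1;
}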

>
> - likewise, vmx_cpu_has_nested_posted_interrupt should only check
> vmx->nested.pi_pending.  I don't think it should even check
> is_guest_mode(vcpu) or vmx->nested.pi_desc; compare with the handling of
> KVM_REQ_NMI, it doesn't test nmi_allowed.  This leads me to the next point.

There is an interesting question about what should happen when
(is_guest_mode(vcpu) && vmx->nested.pi_pending &&
!pi_test_on(vmx->nested.pi_desc)). I would expect the L2 guest not to be
woken from its HLT, as virtual-interrupt-delivery should not see any
pending interrupt to inject. But I would expect the same when the PIR
is empty (even if the ON bit is set), which is not checked in this
condition...

So I think I agree, but we may have a small semantic issue here of
waking the L2 vCPU unnecessarily. This should be a very rare case,
though, so maybe handle it later (write a TODO comment).
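
A sketch of the stricter wakeup check discussed here;
nested_pir_nonempty() is a hypothetical helper, not from the series:

/*
 * Only consider a wakeup warranted if the nested PIR actually contains
 * a pending vector; the ON bit alone can be set by L1 with an empty PIR.
 */
static bool nested_pir_nonempty(struct pi_desc *pi_desc)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(pi_desc->pir); i++)
		if (READ_ONCE(pi_desc->pir[i]))
			return true;

	return false;
}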

>
> - vmx_complete_nested_posted_interrupt *must* find a valid descriptor.
> Even after a vmexit, because vmx_complete_nested_posted_interrupt is the
> very first thing that runs and L1 has had no time to run a vmwrite.  So

I agree. It is better to WARN in case the descriptor is not valid.

> the above could become:
>
> 	if (is_guest_mode(vcpu) && vmx->nested.pi_pending) {
> 		vmx->nested.pi_pending = false;
> 		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
> 				    POSTED_INTR_NESTED_VECTOR);
> 	}
>
> and then what about that is_guest_mode check?  When L1 sent the PI
> notification, it thought that the destination VCPU was in guest mode.
> Upon vmexit, L1 can expect to either see ON=0 or get the notification
> interrupt itself.  So either we deliver the interrupt to L1, or we must
> do L2 virtual interrupt delivery before L1 even runs.  Hence IMO the
> is_guest_mode is wrong, which unfortunately throws a wrench into
> the self-IPI idea.
>
> I agree with moving vmx_complete_nested_posted_interrupt out of
> check_nested_events, and also with the simplification of
> vmx_hwapic_irr_update.  However, in my opinion the self-IPI is actually
> complicating things.  Once the handling of vmx->nested.pi_pending is
> cleaned up (the nested PI interrupt handler in patch 9 is a great
> improvement in that respect), we can move
> vmx_complete_nested_posted_interrupt to a new KVM_REQ_NESTED_PI request
> that replaces vmx->nested.pi_pending.  The GUEST_INTR_STATUS write can
> be done in the vmcs12, and will then be copied by prepare_vmcs02 to the
> vmcs02.
>
> I may be wrong, but based on my experience cleaning up the bare-metal
> APICv, I suspect that both the current code and this version are overly
> complicated, and there is a nice and simple core waiting to be
> extracted.  Let's fix the race condition in the simple way, and
> separately focus on cleaning up pi_pending and everything that follows
> from there.

Interesting.

I think I now follow what you mean regarding cleaning logic around 
pi_pending. This is how I understand it:

1. "vmx->nested.pi_pending" is a flag used to indicate "L1 has sent a 
vmcs12->posted_intr_nv IPI". That's it.

2. The current code is a bit weird in the sense that instead of signaling
the pending IPI in the virtual LAPIC IRR, we set it in a special variable.
If we set it in the virtual LAPIC IRR, we could in theory behave
very similarly to a standard CPU. At the interrupt injection point, we could:
(a) If the vCPU is in root-mode: Just inject the pending interrupt normally.
(b) If the vCPU is in non-root-mode and the posted-interrupts feature is
active, then instead of injecting the pending interrupt, we should simulate
processing of posted-interrupts.

3. The processing of the nested posted-interrupts itself can still be
done via the self-IPI mechanism.

4. Because we don't do (2), there is currently an issue that L1
doesn't receive a vmcs12->posted_intr_nv interrupt when the target vCPU
thread has exited from L2 to L1 with pi_pending=true.

Do we agree on the above? Or am I still misunderstanding something?
If we agree, I will attempt to clean code to handle it like described above.

Regards,
-Liran

>
> Paolo
>
Paolo Bonzini Dec. 27, 2017, 3:55 p.m. UTC | #5
On 27/12/2017 16:15, Liran Alon wrote:
> I think I now follow what you mean regarding cleaning logic around
> pi_pending. This is how I understand it:
> 
> 1. "vmx->nested.pi_pending" is a flag used to indicate "L1 has sent a
> vmcs12->posted_intr_nv IPI". That's it.
> 
> 2. The current code is a bit weird in the sense that instead of signaling
> the pending IPI in the virtual LAPIC IRR, we set it in a special variable.
> If we set it in the virtual LAPIC IRR, we could in theory behave
> very similarly to a standard CPU. At the interrupt injection point, we could:
> (a) If the vCPU is in root-mode: Just inject the pending interrupt normally.
> (b) If the vCPU is in non-root-mode and the posted-interrupts feature is
> active, then instead of injecting the pending interrupt, we should simulate
> processing of posted-interrupts.
> 
> 3. The processing of the nested posted-interrupts itself can still be
> done via the self-IPI mechanism.
> 
> 4. Because we don't do (2), there is currently an issue that L1
> doesn't receive a vmcs12->posted_intr_nv interrupt when the target vCPU
> thread has exited from L2 to L1 with pi_pending=true.
> 
> Do we agree on the above? Or am I still misunderstanding something?

Yes, I think we agree.

Paolo
Oliver Upton Nov. 23, 2020, 7:22 p.m. UTC | #6
>On 27/12/2017 16:15, Liran Alon wrote:
>> I think I now follow what you mean regarding cleaning logic around
>> pi_pending. This is how I understand it:
>> 
>> 1. "vmx->nested.pi_pending" is a flag used to indicate "L1 has sent a
>> vmcs12->posted_intr_nv IPI". That's it.
>> 
>> 2. The current code is a bit weird in the sense that instead of signaling
>> the pending IPI in the virtual LAPIC IRR, we set it in a special variable.
>> If we set it in the virtual LAPIC IRR, we could in theory behave
>> very similarly to a standard CPU. At the interrupt injection point, we could:
>> (a) If the vCPU is in root-mode: Just inject the pending interrupt normally.
>> (b) If the vCPU is in non-root-mode and the posted-interrupts feature is
>> active, then instead of injecting the pending interrupt, we should simulate
>> processing of posted-interrupts.
>> 
>> 3. The processing of the nested posted-interrupts itself can still be
>> done via the self-IPI mechanism.
>> 
>> 4. Because we don't do (2), there is currently an issue that L1
>> doesn't receive a vmcs12->posted_intr_nv interrupt when the target vCPU
>> thread has exited from L2 to L1 with pi_pending=true.
>> 
>> Do we agree on the above? Or am I still misunderstanding something?
>
> Yes, I think we agree.
>
> Paolo

Digging up this old thread to add discussions I had with Jim and Sean
recently.

We were looking at what was necessary in order to implement the
suggestions above (route the nested PINV through L1 IRR instead of using
the pi_pending bit), but now believe this change could never work
perfectly.

The pi_pending bit works rather well as it is only a hint to KVM that it
may owe the guest a posted-interrupt completion. However, if we were to
set the guest's nested PINV as pending in the L1 IRR it'd be challenging
to infer whether or not it should actually be injected in L1 or result
in posted-interrupt processing for L2.

A possible solution would be to install a real ISR in L0 for KVM's
nested PINV in concert with a per-CPU data structure containing a
linked-list of possible vCPUs that could've been meant to receive the
doorbell. But how would L0 know to remove a vCPU from this list? Since
posted-interrupt delivery is exitless, KVM never actually knows if a vCPU
got the intended doorbell.

Short of any brilliant ideas, it seems that the pi_pending bit
is probably here to stay. I have a patch to serialize it in the
{GET,SET}_NESTED_STATE ioctls (live migration is currently busted for
nested posted interrupts) but wanted to make sure we all agree
pi_pending isn't going anywhere.
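
A hypothetical sketch of what that serialization could look like; the
flag name and bit value are invented for illustration and are not an
existing part of the uAPI:

/* Invented flag for illustration; not in the current uAPI. */
#define KVM_STATE_NESTED_PI_PENDING	0x00000010

	/* In vmx_get_nested_state(), alongside the existing flags: */
	if (vmx->nested.pi_pending)
		kvm_state.flags |= KVM_STATE_NESTED_PI_PENDING;

	/* And the inverse in vmx_set_nested_state(): */
	vmx->nested.pi_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_PI_PENDING);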

--
Thanks,
Oliver
Paolo Bonzini Nov. 23, 2020, 10:42 p.m. UTC | #7
On 23/11/20 20:22, Oliver Upton wrote:
> The pi_pending bit works rather well as it is only a hint to KVM that it
> may owe the guest a posted-interrupt completion. However, if we were to
> set the guest's nested PINV as pending in the L1 IRR it'd be challenging
> to infer whether or not it should actually be injected in L1 or result
> in posted-interrupt processing for L2.

Stupid question: why does it matter?  The behavior when the PINV is 
delivered does not depend on the time it enters the IRR, only on the 
time that it enters ISR.  If that happens while the vCPU while in L2, it 
would trigger posted interrupt processing; if PINV moves to ISR while in 
L1, it would be delivered normally as an interrupt.

There are various special cases but they should fall in place.  For 
example, if PINV is delivered during L1 vmentry (with IF=0), it would be 
delivered at the next inject_pending_event when the VMRUN vmexit is 
processed and interrupts are unmasked.

The tricky case is when L0 tries to deliver the PINV to L1 as a posted 
interrupt, i.e. in vmx_deliver_nested_posted_interrupt.  Then the

                 if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
                         kvm_vcpu_kick(vcpu);

needs a tweak to fall back to setting the PINV in L1's IRR:

                 if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) {
                         /* set PINV in L1's IRR */
			kvm_vcpu_kick(vcpu);
		}

but you also have to do the same *in the PINV handler* 
sysvec_kvm_posted_intr_nested_ipi too, to handle the case where the 
L2->L0 vmexit races against sending the IPI.
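
A sketch expanding the /* set PINV in L1's IRR */ placeholder above;
kvm_lapic_set_irr() is an existing lapic helper, though whether it is
the right primitive here is an assumption:

	if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) {
		/*
		 * Fall back: post the notification vector in L1's vIRR.
		 * If it moves to the ISR while in L1, it is delivered as
		 * an ordinary interrupt; if the vCPU re-enters L2 first,
		 * it triggers posted-interrupt processing there.
		 */
		kvm_lapic_set_irr(to_vmx(vcpu)->nested.posted_intr_nv,
				  vcpu->arch.apic);
		kvm_make_request(KVM_REQ_EVENT, vcpu);
		kvm_vcpu_kick(vcpu);
	}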

What am I missing?

Paolo
Oliver Upton Nov. 24, 2020, 12:10 a.m. UTC | #8
On Mon, Nov 23, 2020 at 2:42 PM Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> On 23/11/20 20:22, Oliver Upton wrote:
> > The pi_pending bit works rather well as it is only a hint to KVM that it
> > may owe the guest a posted-interrupt completion. However, if we were to
> > set the guest's nested PINV as pending in the L1 IRR it'd be challenging
> > to infer whether or not it should actually be injected in L1 or result
> > in posted-interrupt processing for L2.
>
> Stupid question: why does it matter?  The behavior when the PINV is
> delivered does not depend on the time it enters the IRR, only on the
> time that it enters ISR.  If that happens while the vCPU is in L2, it
> would trigger posted interrupt processing; if PINV moves to ISR while in
> L1, it would be delivered normally as an interrupt.
>
> There are various special cases but they should fall in place.  For
> example, if PINV is delivered during L1 vmentry (with IF=0), it would be
> delivered at the next inject_pending_event when the VMRUN vmexit is
> processed and interrupts are unmasked.
>
> The tricky case is when L0 tries to deliver the PINV to L1 as a posted
> interrupt, i.e. in vmx_deliver_nested_posted_interrupt.  Then the
>
>                  if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
>                          kvm_vcpu_kick(vcpu);
>
> needs a tweak to fall back to setting the PINV in L1's IRR:
>
>                  if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) {
>                          /* set PINV in L1's IRR */
>                         kvm_vcpu_kick(vcpu);
>                 }

Yeah, I think that's fair. Regardless, the pi_pending bit should've
only been set if the IPI was actually sent. Though I suppose

> but you also have to do the same *in the PINV handler*
> sysvec_kvm_posted_intr_nested_ipi too, to handle the case where the
> L2->L0 vmexit races against sending the IPI.

Indeed, there is a race but are we assured that the target vCPU thread
is scheduled on the target CPU when that IPI arrives?

I believe there is a 1-to-many relationship here, which is why I said
each CPU would need to maintain a linked list of possible vCPUs to
iterate and find the intended recipient. The process of removing vCPUs
from the list where we caught the IPI in L0 is quite clear, but it
doesn't seem like we could ever know to remove vCPUs from the list
when hardware catches that IPI.

If the ISR thing can be figured out then that'd be great, though it
seems infeasible because we are racing with scheduling on the target.

Could we split the difference and do something like:

        if (kvm_vcpu_trigger_posted_interrupt(vcpu, true)) {
                vmx->nested.pi_pending = true;
        } else {
                /* set PINV in L1's IRR */
                kvm_vcpu_kick(vcpu);
        }

which ensures we only set the hint when KVM might actually have
something to do. Otherwise, it'll deliver to L1 like a normal
interrupt or trigger posted-interrupt processing on nested VM-entry if
IF=0.

> What am I missing?
>
> Paolo
>
Oliver Upton Nov. 24, 2020, 12:13 a.m. UTC | #9
On Mon, Nov 23, 2020 at 4:10 PM Oliver Upton <oupton@google.com> wrote:
>
> On Mon, Nov 23, 2020 at 2:42 PM Paolo Bonzini <pbonzini@redhat.com> wrote:
> >
> > On 23/11/20 20:22, Oliver Upton wrote:
> > > The pi_pending bit works rather well as it is only a hint to KVM that it
> > > may owe the guest a posted-interrupt completion. However, if we were to
> > > set the guest's nested PINV as pending in the L1 IRR it'd be challenging
> > > to infer whether or not it should actually be injected in L1 or result
> > > in posted-interrupt processing for L2.
> >
> > Stupid question: why does it matter?  The behavior when the PINV is
> > delivered does not depend on the time it enters the IRR, only on the
> > time that it enters ISR.  If that happens while the vCPU is in L2, it
> > would trigger posted interrupt processing; if PINV moves to ISR while in
> > L1, it would be delivered normally as an interrupt.
> >
> > There are various special cases but they should fall in place.  For
> > example, if PINV is delivered during L1 vmentry (with IF=0), it would be
> > delivered at the next inject_pending_event when the VMRUN vmexit is
> > processed and interrupts are unmasked.
> >
> > The tricky case is when L0 tries to deliver the PINV to L1 as a posted
> > interrupt, i.e. in vmx_deliver_nested_posted_interrupt.  Then the
> >
> >                  if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
> >                          kvm_vcpu_kick(vcpu);
> >
> > needs a tweak to fall back to setting the PINV in L1's IRR:
> >
> >                  if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) {
> >                          /* set PINV in L1's IRR */
> >                         kvm_vcpu_kick(vcpu);
> >                 }
>
> Yeah, I think that's fair. Regardless, the pi_pending bit should've
> only been set if the IPI was actually sent. Though I suppose

Didn't finish my thought :-/

Though I suppose pi_pending was set unconditionally (and skipped the
IRR) since until recently KVM completely bungled handling the PINV
correctly when in the L1 IRR.

>
> > but you also have to do the same *in the PINV handler*
> > sysvec_kvm_posted_intr_nested_ipi too, to handle the case where the
> > L2->L0 vmexit races against sending the IPI.
>
> Indeed, there is a race but are we assured that the target vCPU thread
> is scheduled on the target CPU when that IPI arrives?
>
> I believe there is a 1-to-many relationship here, which is why I said
> each CPU would need to maintain a linked list of possible vCPUs to
> iterate and find the intended recipient. The process of removing vCPUs
> from the list where we caught the IPI in L0 is quite clear, but it
> doesn't seem like we could ever know to remove vCPUs from the list
> when hardware catches that IPI.
>
> If the ISR thing can be figured out then that'd be great, though it
> seems infeasible because we are racing with scheduling on the target.
>
> Could we split the difference and do something like:
>
>         if (kvm_vcpu_trigger_posted_interrupt(vcpu, true)) {
>                 vmx->nested.pi_pending = true;
>         } else {
>                 /* set PINV in L1's IRR */
>                 kvm_vcpu_kick(vcpu);
>         }
>
> which ensures we only set the hint when KVM might actually have
> something to do. Otherwise, it'll deliver to L1 like a normal
> interrupt or trigger posted-interrupt processing on nested VM-entry if
> IF=0.
>
> > What am I missing?
> >
> > Paolo
> >
Sean Christopherson Nov. 24, 2020, 1:55 a.m. UTC | #10
On Mon, Nov 23, 2020 at 04:13:49PM -0800, Oliver Upton wrote:
> On Mon, Nov 23, 2020 at 4:10 PM Oliver Upton <oupton@google.com> wrote:
> >
> > On Mon, Nov 23, 2020 at 2:42 PM Paolo Bonzini <pbonzini@redhat.com> wrote:
> > >
> > > On 23/11/20 20:22, Oliver Upton wrote:
> > > > The pi_pending bit works rather well as it is only a hint to KVM that it
> > > > may owe the guest a posted-interrupt completion. However, if we were to
> > > > set the guest's nested PINV as pending in the L1 IRR it'd be challenging
> > > > to infer whether or not it should actually be injected in L1 or result
> > > > in posted-interrupt processing for L2.
> > >
> > > Stupid question: why does it matter?  The behavior when the PINV is
> > > delivered does not depend on the time it enters the IRR, only on the
> > > time that it enters ISR.  If that happens while the vCPU is in L2, it
> > > would trigger posted interrupt processing; if PINV moves to ISR while in
> > > L1, it would be delivered normally as an interrupt.
> > >
> > > There are various special cases but they should fall in place.  For
> > > example, if PINV is delivered during L1 vmentry (with IF=0), it would be
> > > delivered at the next inject_pending_event when the VMRUN vmexit is
> > > processed and interrupts are unmasked.
> > >
> > > The tricky case is when L0 tries to deliver the PINV to L1 as a posted
> > > interrupt, i.e. in vmx_deliver_nested_posted_interrupt.  Then the
> > >
> > >                  if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
> > >                          kvm_vcpu_kick(vcpu);
> > >
> > > needs a tweak to fall back to setting the PINV in L1's IRR:
> > >
> > >                  if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) {
> > >                          /* set PINV in L1's IRR */
> > >                         kvm_vcpu_kick(vcpu);
> > >                 }
> >
> > Yeah, I think that's fair. Regardless, the pi_pending bit should've
> > only been set if the IPI was actually sent. Though I suppose
> 
> Didn't finish my thought :-/
> 
> Though I suppose pi_pending was set unconditionally (and skipped the
> IRR) since until recently KVM completely bungled handling the PINV
> correctly when in the L1 IRR.
> 
> >
> > > but you also have to do the same *in the PINV handler*
> > > sysvec_kvm_posted_intr_nested_ipi too, to handle the case where the
> > > L2->L0 vmexit races against sending the IPI.
> >
> > Indeed, there is a race but are we assured that the target vCPU thread
> > is scheduled on the target CPU when that IPI arrives?
> >
> > I believe there is a 1-to-many relationship here, which is why I said
> > each CPU would need to maintain a linked list of possible vCPUs to
> > iterate and find the intended recipient.

Ya, the concern is that it's theoretically possible for the PINV to arrive in L0
after a different vCPU has been loaded (or even multiple different vCPUs).      
E.g. if the sending pCPU is hit with an NMI after checking vcpu->mode, and the  
NMI runs for some absurd amount of time.  If that happens, the PINV handler     
won't know which vCPU(s) should get an IRQ injected into L1 without additional  
tracking.  KVM would need to set something like nested.pi_pending before doing  
kvm_vcpu_trigger_posted_interrupt(), i.e. nothing really changes, it just gets  
more complex.

> > The process of removing vCPUs from the list where we caught the IPI
> > in L0 is quite clear, but it doesn't seem like we could ever know to
> > remove vCPUs from the list when hardware catches that IPI.

It's probably possible by shadowing the PI descriptor, but doing so would likely
wipe out the perf benefits of nested PI.

That being said, if we don't care about strictly adhering to the spec (sorry in
advance Jim), I think we could avoid nested.pi_pending if we're ok with KVM
processing virtual interrupts that technically shouldn't happen, yet.  E.g. if
the L0 PINV handler consumes vIRR bits that were set after the last PI from L1.
KVM already does that with nested.pi_pending, so it might not be the worst idea
in the world since the existing nested PI implementation mostly works.
Sean Christopherson Nov. 24, 2020, 3:19 a.m. UTC | #11
On Tue, Nov 24, 2020 at 01:55:15AM +0000, Sean Christopherson wrote:
> On Mon, Nov 23, 2020 at 04:13:49PM -0800, Oliver Upton wrote:
> > On Mon, Nov 23, 2020 at 4:10 PM Oliver Upton <oupton@google.com> wrote:
> > >
> > > On Mon, Nov 23, 2020 at 2:42 PM Paolo Bonzini <pbonzini@redhat.com> wrote:
> > > >
> > > > On 23/11/20 20:22, Oliver Upton wrote:
> > > > > The pi_pending bit works rather well as it is only a hint to KVM that it
> > > > > may owe the guest a posted-interrupt completion. However, if we were to
> > > > > set the guest's nested PINV as pending in the L1 IRR it'd be challenging
> > > > > to infer whether or not it should actually be injected in L1 or result
> > > > > in posted-interrupt processing for L2.
> > > >
> > > > Stupid question: why does it matter?  The behavior when the PINV is
> > > > delivered does not depend on the time it enters the IRR, only on the
> > > > time that it enters ISR.  If that happens while the vCPU is in L2, it
> > > > would trigger posted interrupt processing; if PINV moves to ISR while in
> > > > L1, it would be delivered normally as an interrupt.
> > > >
> > > > There are various special cases but they should fall in place.  For
> > > > example, if PINV is delivered during L1 vmentry (with IF=0), it would be
> > > > delivered at the next inject_pending_event when the VMRUN vmexit is
> > > > processed and interrupts are unmasked.
> > > >
> > > > The tricky case is when L0 tries to deliver the PINV to L1 as a posted
> > > > interrupt, i.e. in vmx_deliver_nested_posted_interrupt.  Then the
> > > >
> > > >                  if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
> > > >                          kvm_vcpu_kick(vcpu);
> > > >
> > > > needs a tweak to fall back to setting the PINV in L1's IRR:
> > > >
> > > >                  if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) {
> > > >                          /* set PINV in L1's IRR */
> > > >                         kvm_vcpu_kick(vcpu);
> > > >                 }
> > >
> > > Yeah, I think that's fair. Regardless, the pi_pending bit should've
> > > only been set if the IPI was actually sent. Though I suppose
> > 
> > Didn't finish my thought :-/
> > 
> > Though I suppose pi_pending was set unconditionally (and skipped the
> > IRR) since until recently KVM completely bungled handling the PINV
> > correctly when in the L1 IRR.
> > 
> > >
> > > > but you also have to do the same *in the PINV handler*
> > > > sysvec_kvm_posted_intr_nested_ipi too, to handle the case where the
> > > > L2->L0 vmexit races against sending the IPI.
> > >
> > > Indeed, there is a race but are we assured that the target vCPU thread
> > > is scheduled on the target CPU when that IPI arrives?
> > >
> > > I believe there is a 1-to-many relationship here, which is why I said
> > > each CPU would need to maintain a linked list of possible vCPUs to
> > > iterate and find the intended recipient.
> 
> Ya, the concern is that it's theoretically possible for the PINV to arrive in L0
> after a different vCPU has been loaded (or even multiple different vCPUs).      
> E.g. if the sending pCPU is hit with an NMI after checking vcpu->mode, and the  
> NMI runs for some absurd amount of time.  If that happens, the PINV handler     
> won't know which vCPU(s) should get an IRQ injected into L1 without additional  
> tracking.  KVM would need to set something like nested.pi_pending before doing  
> kvm_vcpu_trigger_posted_interrupt(), i.e. nothing really changes, it just gets  
> more complex.

If we do want to do something fancy with the L0 PINV handler, I think we could
avoid a list by having the dest vCPU busy wait before exiting guest_mode if
there are one or more PINV IPIs in the process of being sent, and then
wait again in vcpu_put() for the PINV to be handled.  The latter in particular
would rarely come into play, i.e. shouldn't impact performance.  That would
prevent having a somewhat unbounded set of vCPUs that may or may not have an
outstanding PINV.
Paolo Bonzini Nov. 24, 2020, 11:09 a.m. UTC | #12
On 24/11/20 01:10, Oliver Upton wrote:
>> but you also have to do the same *in the PINV handler*
>> sysvec_kvm_posted_intr_nested_ipi too, to handle the case where the
>> L2->L0 vmexit races against sending the IPI.
> Indeed, there is a race but are we assured that the target vCPU thread
> is scheduled on the target CPU when that IPI arrives?

This would only happen if the source vCPU saw vcpu->mode == 
IN_GUEST_MODE for the target vCPU.  Thus there are three cases:

1) the vCPU is in non-root mode.  This is easy. :)

2) the vCPU hasn't entered the VM yet.  Then posted interrupt IPIs are 
delayed until after guest entry and ensured to result in virtual interrupt 
delivery, just like case 1.

3) the vCPU has left the VM but it hasn't reached

         vcpu->mode = OUTSIDE_GUEST_MODE;
         smp_wmb();

yet.  Then the interrupt will be handled right after that moment, at:

         kvm_before_interrupt(vcpu);
         local_irq_enable();
         ++vcpu->stat.exits;
         local_irq_disable();
         kvm_after_interrupt(vcpu);

Anything else will cause kvm_vcpu_trigger_posted_interrupt(vcpu, true) 
to return false instead of sending an IPI.

Paolo
Paolo Bonzini Nov. 24, 2020, 11:39 a.m. UTC | #13
On 24/11/20 02:55, Sean Christopherson wrote:
>>> I believe there is a 1-to-many relationship here, which is why I said
>>> each CPU would need to maintain a linked list of possible vCPUs to
>>> iterate and find the intended recipient.
> 
> Ya, the concern is that it's theoretically possible for the PINV to arrive in L0
> after a different vCPU has been loaded (or even multiple different vCPUs).
> E.g. if the sending pCPU is hit with an NMI after checking vcpu->mode, and the
> NMI runs for some absurd amount of time.  If that happens, the PINV handler
> won't know which vCPU(s) should get an IRQ injected into L1 without additional
> tracking.  KVM would need to set something like nested.pi_pending before doing
> kvm_vcpu_trigger_posted_interrupt(), i.e. nothing really changes, it just gets
> more complex.

Ah, gotcha.  What if IN_GUEST_MODE/OUTSIDE_GUEST_MODE was replaced by a 
generation count?  Then you reread vcpu->mode after sending the IPI, and 
retry if it does not match.

To guarantee atomicity, the OUTSIDE_GUEST_MODE, IN_GUEST_MODE, 
EXITING_GUEST_MODE and READING_SHADOW_PAGE_TABLES values would remain in 
the bottom 2 bits.  That is, the vcpu->mode accesses like

	vcpu->mode = IN_GUEST_MODE;

	vcpu->mode = OUTSIDE_GUEST_MODE;

	smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);

	smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);

	return cmpxchg(&vcpu->mode, IN_GUEST_MODE, EXITING_GUEST_MODE);

becoming

	enum {
		OUTSIDE_GUEST_MODE,
		IN_GUEST_MODE,
		EXITING_GUEST_MODE,
		READING_SHADOW_PAGE_TABLES,
		GUEST_MODE_MASK = 3,
	};

	vcpu->mode = (vcpu->mode | GUEST_MODE_MASK) + 1 + IN_GUEST_MODE;

	vcpu->mode &= ~GUEST_MODE_MASK;

	smp_store_mb(vcpu->mode, vcpu->mode|READING_SHADOW_PAGE_TABLES);

	smp_store_release(&vcpu->mode, vcpu->mode & ~GUEST_MODE_MASK);

	int x = READ_ONCE(vcpu->mode);
	do {
		if ((x & GUEST_MODE_MASK) != IN_GUEST_MODE)
			return x & GUEST_MODE_MASK;
	} while (!try_cmpxchg(&vcpu->mode, &x,
			      x ^ IN_GUEST_MODE ^ EXITING_GUEST_MODE))
	return IN_GUEST_MODE;

You could still get spurious posted interrupt IPIs, but the IPI handler 
need not do anything at all and that is much better.
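
A sketch of the sender-side retry under this generation-count scheme
(names are illustrative; note also Sean's objection below about the
window after VM-exit but before vcpu->mode is updated):

/*
 * Resend the notification until vcpu->mode (generation + state) is
 * unchanged across the send, so a concurrent exit/re-entry cannot
 * swallow the IPI.
 */
static bool pi_send_with_retry(struct kvm_vcpu *vcpu, int pi_vec)
{
	for (;;) {
		int old = READ_ONCE(vcpu->mode);

		if ((old & GUEST_MODE_MASK) != IN_GUEST_MODE)
			return false;	/* caller falls back to a kick */

		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);

		/* A generation change means the vCPU may have exited
		 * before the IPI landed; send again. */
		if (READ_ONCE(vcpu->mode) == old)
			return true;
	}
}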

> if we're ok with KVM
> processing virtual interrupts that technically shouldn't happen, yet.  E.g. if
> the L0 PINV handler consumes vIRR bits that were set after the last PI from L1.

I actually find it curious that the spec promises posted interrupt 
processing to be triggered only by the arrival of the posted interrupt 
IPI.  Why couldn't the processor in principle snoop for the address of 
the ON bit instead, similar to an MWAIT?

But even without that, I don't think the spec promises that kind of 
strict ordering with respect to what goes on in the source.  Even though 
posted interrupt processing is atomic with the acknowledgement of the 
posted interrupt IPI, the spec only promises that the PINV triggers an 
_eventual_ scan of PID.PIR when the interrupt controller delivers an 
unmasked external interrupt to the destination CPU.  You can still have 
something like

	set PID.PIR[100]
	set PID.ON
					processor starts executing a
					 very slow instruction...
	send PINV
	set PID.PIR[200]
					acknowledge PINV

and then vector 200 would be delivered before vector 100.  Of course 
with nested PI the effect would be amplified, but it's possible even on 
bare metal.

Paolo
Sean Christopherson Nov. 24, 2020, 9:22 p.m. UTC | #14
On Tue, Nov 24, 2020, Paolo Bonzini wrote:
> On 24/11/20 02:55, Sean Christopherson wrote:
> > > > I believe there is a 1-to-many relationship here, which is why I said
> > > > each CPU would need to maintain a linked list of possible vCPUs to
> > > > iterate and find the intended recipient.
> > 
> > Ya, the concern is that it's theoretically possible for the PINV to arrive in L0
> > after a different vCPU has been loaded (or even multiple different vCPUs).
> > E.g. if the sending pCPU is hit with an NMI after checking vcpu->mode, and the
> > NMI runs for some absurd amount of time.  If that happens, the PINV handler
> > won't know which vCPU(s) should get an IRQ injected into L1 without additional
> > tracking.  KVM would need to set something like nested.pi_pending before doing
> > kvm_vcpu_trigger_posted_interrupt(), i.e. nothing really changes, it just gets
> > more complex.
> 
> Ah, gotcha.  What if IN_GUEST_MODE/OUTSIDE_GUEST_MODE was replaced by a
> generation count?  Then you reread vcpu->mode after sending the IPI, and
> retry if it does not match.
> 
> To guarantee atomicity, the OUTSIDE_GUEST_MODE, IN_GUEST_MODE,
> EXITING_GUEST_MODE and READING_SHADOW_PAGE_TABLES values would remain in
> the bottom 2 bits.  That is, the vcpu->mode accesses like
> 
> 	vcpu->mode = IN_GUEST_MODE;
> 
> 	vcpu->mode = OUTSIDE_GUEST_MODE;
> 
> 	smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
> 
> 	smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
> 
> 	return cmpxchg(&vcpu->mode, IN_GUEST_MODE, EXITING_GUEST_MODE);
> 
> becoming
> 
> 	enum {
> 		OUTSIDE_GUEST_MODE,
> 		IN_GUEST_MODE,
> 		EXITING_GUEST_MODE,
> 		READING_SHADOW_PAGE_TABLES,
> 		GUEST_MODE_MASK = 3,
> 	};
> 
> 	vcpu->mode = (vcpu->mode | GUEST_MODE_MASK) + 1 + IN_GUEST_MODE;
> 
> 	vcpu->mode &= ~GUEST_MODE_MASK;
> 
> 	smp_store_mb(vcpu->mode, vcpu->mode|READING_SHADOW_PAGE_TABLES);
> 
> 	smp_store_release(&vcpu->mode, vcpu->mode & ~GUEST_MODE_MASK);
> 
> 	int x = READ_ONCE(vcpu->mode);
> 	do {
> 		if ((x & GUEST_MODE_MASK) != IN_GUEST_MODE)
> 			return x & GUEST_MODE_MASK;
> 	} while (!try_cmpxchg(&vcpu->mode, &x,
> 			      x ^ IN_GUEST_MODE ^ EXITING_GUEST_MODE))
> 	return IN_GUEST_MODE;
> 
> You could still get spurious posted interrupt IPIs, but the IPI handler need
> not do anything at all and that is much better.

This doesn't handle the case where the PINV arrives in L0 after VM-Exit but
before the vCPU clears IN_GUEST_MODE.  The sender will have seen IN_GUEST_MODE
and so won't retry the IPI, but hardware didn't process the PINV as a
posted-interrupt.  I.e. the L0 PINV handler still needs an indicator a la
nested.pi_pending to know that it should stuff an IRQ into L1's vIRR.

> > if we're ok with KVM
> > processing virtual interrupts that technically shouldn't happen, yet.  E.g. if
> > the L0 PINV handler consumes vIRR bits that were set after the last PI from L1.
> 
> I actually find it curious that the spec promises posted interrupt
> processing to be triggered only by the arrival of the posted interrupt IPI.
> Why couldn't the processor in principle snoop for the address of the ON bit
> instead, similar to an MWAIT?

It would lead to false positives and missed IRQs.  PI processing would fire on
writing the PI _cache line_, not just on writes to PI.ON.  I suspect MONITOR is
also triggered on request-for-EXCLUSIVE and not just writes, i.e. on speculative
behavior, but I forget if that's actually the case.

Regardless, a write to any part of the PI would trip the monitoring, and then
all subsequent writes would be missed, e.g. the other package writes PI.IRR then
PI.ON, and CPU PI processing triggers on the PI.IRR write but not the PI.ON write.  The
target CPU (the running vCPU) would have to constantly rearm the monitor, but
even then there would always be a window where a write would get missed.

> But even without that, I don't think the spec promises that kind of strict
> ordering with respect to what goes on in the source.  Even though posted
> interrupt processing is atomic with the acknowledgement of the posted
> interrupt IPI, the spec only promises that the PINV triggers an _eventual_
> scan of PID.PIR when the interrupt controller delivers an unmasked external
> interrupt to the destination CPU.  You can still have something like
> 
> 	set PID.PIR[100]
> 	set PID.ON
> 					processor starts executing a
> 					 very slow instruction...
> 	send PINV
> 	set PID.PIR[200]
> 					acknowledge PINV
> 
> and then vector 200 would be delivered before vector 100.  Of course with
> nested PI the effect would be amplified, but it's possible even on bare
> metal.

Jim was concerned that L1 could poll the PID to determine whether or not
PID.PIR[200] should be seen in L2.  The whole PIR is copied to the vIRR after
PID.ON is cleared and the auto-EOI is done, and the read->clear is atomic.  So the
above sequence where PINV is acknowledged after PID.PIR[200] is legal, but
processing PIR bits that are set after the PIR is observed to be cleared would
be illegal.  E.g. if L1 did this

	set PID.PIR[100]
	set PID.ON
	send PINV
	while (PID.PIR)
	set PID.PIR[200]
	set PID.ON

then L2 should never observe vector 200.  KVM violates this because
nested.pi_pending is left set even if PINV is handled as a posted interrupt, and
KVM's processing of nested.pi_pending will see the second PID.ON and incorrectly
do PI processing in software.  This is the part that is likely impossible to
solve without shadowing the PID (which, for the record, I have zero desire to do).

It seems extremely unlikely any guest will puke on the above; I can't imagine
there's a use case for setting PID.PIR + PID.ON without triggering PINV, but it's
technically bad behavior in KVM.
Paolo Bonzini Nov. 25, 2020, 12:10 a.m. UTC | #15
On 24/11/20 22:22, Sean Christopherson wrote:
>> What if IN_GUEST_MODE/OUTSIDE_GUEST_MODE was replaced by a
>> generation count?  Then you reread vcpu->mode after sending the IPI, and
>> retry if it does not match.
> The sender will have seen IN_GUEST_MODE and so won't retry the IPI,
> but hardware didn't process the PINV as a posted-interrupt.
Uff, of course.

That said, it may still be a good idea to keep pi_pending as a very 
short-lived flag only to handle this case, and maybe not even need the 
generation count (late here so put up with me if it's wrong :)).  The 
flag would not have to live past vmx_vcpu_run even; the vIRR[PINV] bit 
would be the primary marker that a nested posted interrupt is pending.

>>> if we're ok with KVM
>>> processing virtual interrupts that technically shouldn't happen, yet.  E.g. if
>>> the L0 PINV handler consumes vIRR bits that were set after the last PI from L1.
>>
>> I actually find it curious that the spec promises posted interrupt
>> processing to be triggered only by the arrival of the posted interrupt IPI.
>> Why couldn't the processor in principle snoop for the address of the ON bit
>> instead, similar to an MWAIT?
> 
> It would lead to false positives and missed IRQs.

Not to missed IRQs---false positives on the monitor would be possible, 
but you would still have to send a posted interrupt IPI.  The weird 
promise is that the PINV interrupt is the _only_ trigger for posted 
interrupts.

>> But even without that, I don't think the spec promises that kind of strict
>> ordering with respect to what goes on in the source.  Even though posted
>> interrupt processing is atomic with the acknowledgement of the posted
>> interrupt IPI, the spec only promises that the PINV triggers an _eventual_
>> scan of PID.PIR when the interrupt controller delivers an unmasked external
>> interrupt to the destination CPU.  You can still have something like
>>
>> 	set PID.PIR[100]
>> 	set PID.ON
>> 					processor starts executing a
>> 					 very slow instruction...
>> 	send PINV
>> 	set PID.PIR[200]
>> 					acknowledge PINV
>>
>> and then vector 200 would be delivered before vector 100.  Of course with
>> nested PI the effect would be amplified, but it's possible even on bare
>> metal.
> 
> Jim was concerned that L1 could poll the PID to determine whether or not
> PID.PIR[200] should be seen in L2.  The whole PIR is copied to the vIRR after
> PID.ON is cleared and the auto-EOI is done, and the read->clear is atomic.  So the
> above sequence where PINV is acknowledged after PID.PIR[200] is legal, but
> processing PIR bits that are set after the PIR is observed to be cleared would
> be illegal.

That would be another case of the unnecessarily specific promise above.

> E.g. if L1 did this
> 
> 	set PID.PIR[100]
> 	set PID.ON
> 	send PINV
> 	while (PID.PIR)
> 	set PID.PIR[200]
> 	set PID.ON
>
> This is the part that is likely impossible to
> solve without shadowing the PID (which, for the record, I have zero desire to do).

Neither do I. :)  But technically the SDM doesn't promise reading the 
whole 256 bits at the same time.  Perhaps that's the only way it can 
work in practice due to the cache coherency protocols, but the SDM only 
promises atomicity of the read and clear of "a single PIR bit (or group 
of bits)".  So there's in principle no reason why the target CPU 
couldn't clear PID.PIR[100], and then L1 would sneak in and set 
PID.PIR[200].

Paolo

> It seems extremely unlikely any guest will puke on the above; I can't imagine
> there's a use case for setting PID.PIR + PID.ON without triggering PINV, but it's
> technically bad behavior in KVM.
>
Sean Christopherson Nov. 25, 2020, 1:14 a.m. UTC | #16
On Wed, Nov 25, 2020, Paolo Bonzini wrote:
> On 24/11/20 22:22, Sean Christopherson wrote:
> > > What if IN_GUEST_MODE/OUTSIDE_GUEST_MODE was replaced by a
> > > generation count?  Then you reread vcpu->mode after sending the IPI, and
> > > retry if it does not match.
> > The sender will have seen IN_GUEST_MODE and so won't retry the IPI,
> > but hardware didn't process the PINV as a posted-interrupt.
> Uff, of course.
> 
> That said, it may still be a good idea to keep pi_pending as a very
> short-lived flag only to handle this case, and maybe not even need the
> generation count (late here so put up with me if it's wrong :)).  The flag
> would not have to live past vmx_vcpu_run even; the vIRR[PINV] bit would be
> the primary marker that a nested posted interrupt is pending.

I 100% agree that would be ideal for nested state migration, as it would avoid
polluting the ABI with nested.pi_pending, but the downside is that it would mean
KVM could deliver a physical interrupt twice: once to L2 and once to L1.  E.g.

	while (READ_ONCE(vmx->nested.pi_pending) && PID.ON) {
		vmx->nested.pi_pending = false;
		vIRR.PINV = 1;
	}

would incorrectly set vIRR.PINV in the case where hardware handled the PI, and
that could result in L1 seeing the interrupt if a nested exit occurred before KVM
processed vIRR.PINV for L2.  Note, without PID.ON, the behavior would be really
bad as KVM would set vIRR.PINV *every* time hardware handled the PINV.

The current behavior just means KVM is more greedy with respect to processing
PID.PIR than it technically should be.  Not sure if that distinction is worth
carrying nested.pi_pending.

> > > > if we're ok with KVM
> > > > processing virtual interrupts that technically shouldn't happen, yet.  E.g. if
> > > > the L0 PINV handler consumes vIRR bits that were set after the last PI from L1.
> > > 
> > > I actually find it curious that the spec promises posted interrupt
> > > processing to be triggered only by the arrival of the posted interrupt IPI.
> > > Why couldn't the processor in principle snoop for the address of the ON bit
> > > instead, similar to an MWAIT?
> > 
> > It would lead to false positives and missed IRQs.
> 
> Not to missed IRQs---false positives on the monitor would be possible, but
> you would still have to send a posted interrupt IPI.  The weird promise is
> that the PINV interrupt is the _only_ trigger for posted interrupts.

Ah, I misunderstood the original "only".  I suspect the primary reason is that
it would cost uops to do the snoop thing and would be inefficient in practice.
The entire PID is guaranteed to be in a single cache line, and most (all?) flows
will write PIR before ON.  So snooping the PID will detect the PIR write, bounce
the cache line by reading it, burn some uops checking that ON isn't set[*], and
then do ???

[*] The pseudocode in the SDM doesn't actually state that the CPU checks PID.ON,
    it only says it clears PID.ON, i.e. assumes that PID.ON is set.  That seems
    like an SDM bug.

> > > But even without that, I don't think the spec promises that kind of strict
> > > ordering with respect to what goes on in the source.  Even though posted
> > > interrupt processing is atomic with the acknowledgement of the posted
> > > interrupt IPI, the spec only promises that the PINV triggers an _eventual_
> > > scan of PID.PIR when the interrupt controller delivers an unmasked external
> > > interrupt to the destination CPU.  You can still have something like
> > > 
> > > 	set PID.PIR[100]
> > > 	set PID.ON
> > > 					processor starts executing a
> > > 					 very slow instruction...
> > > 	send PINV
> > > 	set PID.PIR[200]
> > > 					acknowledge PINV
> > > 
> > > and then vector 200 would be delivered before vector 100.  Of course with
> > > nested PI the effect would be amplified, but it's possible even on bare
> > > metal.
> > 
> > Jim was concerned that L1 could poll the PID to determine whether or not
> > PID.PIR[200] should be seen in L2.  The whole PIR is copied to the vIRR after
> > PID.ON is cleared and the auto-EOI is done, and the read->clear is atomic.  So the
> > above sequence where PINV is acknowledged after PID.PIR[200] is legal, but
> > processing PIR bits that are set after the PIR is observed to be cleared would
> > be illegal.
> 
> That would be another case of the unnecessarily specific promise above.
> 
> > E.g. if L1 did this
> > 
> > 	set PID.PIR[100]
> > 	set PID.ON
> > 	send PINV
> > 	while (PID.PIR)
> > 	set PID.PIR[200]
> > 	set PID.ON
> > 
> > This is the part that is likely impossible to
> > solve without shadowing the PID (which, for the record, I have zero desire to do).
> 
> Neither do I. :)  But technically the SDM doesn't promise reading the whole
> 256 bits at the same time.

Hrm, the wording is poor, but my interpretation of this blurb is that the CPU
somehow has a death grip on the PID cache line while it's reading and clearing
the PIR.

  5. The logical processor performs a logical-OR of PIR into VIRR and clears PIR.
     No other agent can read or write a PIR bit (or group of bits) between the
     time it is read (to determine what to OR into VIRR) and when it is cleared.

> Perhaps that's the only way it can work in
> practice due to the cache coherency protocols, but the SDM only promises
> atomicity of the read and clear of "a single PIR bit (or group of bits)".
> So there's in principle no reason why the target CPU couldn't clear
> PID.PIR[100], and then L1 would sneak in and set PID.PIR[200].
> 
> Paolo
> 
> > It seems extremely unlikely any guest will puke on the above; I can't imagine
> > there's a use case for setting PID.PIR + PID.ON without triggering PINV, but it's
> > technically bad behavior in KVM.
> > 
>
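To make the per-group reading concrete: a software scan that is atomic only
within each group could drain the PIR roughly as below.  This is a sketch of
the point under discussion, not KVM's code (KVM's actual drain lives in
__kvm_apic_update_irr), and the helper name is made up.

	/*
	 * Read-and-clear the PIR one 64-bit group at a time.  Each xchg()
	 * is atomic, but another agent can still set a bit in group 3
	 * after group 0 has already been cleared, which is exactly how
	 * PID.PIR[200] can sneak in on the narrow reading of SDM step 5.
	 */
	static void pir_drain_to_virr(u64 *pir, u64 *virr)
	{
		int i;

		for (i = 0; i < 4; i++) {
			if (READ_ONCE(pir[i]))
				virr[i] |= xchg(&pir[i], 0);
		}
	}
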
Paolo Bonzini Nov. 25, 2020, 5 p.m. UTC | #17
On 25/11/20 02:14, Sean Christopherson wrote:
>> The flag
>> would not have to live past vmx_vcpu_run even, the vIRR[PINV] bit would be
>> the primary marker that a nested posted interrupt is pending.
>
> 	while (READ_ONCE(vmx->nested.pi_pending) && PID.ON) {
> 		vmx->nested.pi_pending = false;
> 		vIRR.PINV = 1;
> 	}
> 
> would incorrectly set vIRR.PINV in the case where hardware handled the PI, and
that could result in L1 seeing the interrupt if a nested exit occurred before KVM
> processed vIRR.PINV for L2.  Note, without PID.ON, the behavior would be really
> bad as KVM would set vIRR.PINV *every* time hardware handled the PINV.

It doesn't have to be a while loop, since by the time we get here 
vcpu->mode is not IN_GUEST_MODE anymore.  To avoid the double PINV 
delivery, we could process the PID as in 
vmx_complete_nested_posted_interrupt in this particular case---but 
vmx_complete_nested_posted_interrupt would be moved from vmentry to 
vmexit, and the common case would use vIRR.PINV instead.  There would 
still be double processing, but it would solve the migration problem in 
a relatively elegant manner.
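
A rough sketch of that vmexit-time variant, with names assumed from the
nVMX code of this era (RVI handling omitted; a sketch, not the actual
change):

	static void nested_vmx_complete_pi_on_exit(struct kvm_vcpu *vcpu)
	{
		struct vcpu_vmx *vmx = to_vmx(vcpu);
		void *vapic_page = vmx->nested.virtual_apic_map.hva;
		int max_irr;

		if (!vapic_page || !vmx->nested.pi_desc ||
		    !pi_test_and_clear_on(vmx->nested.pi_desc))
			return;

		/* Move the vmcs12 PIR into the virtual-APIC page's vIRR,
		 * one 32-bit group at a time via atomic xchg. */
		__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
				      vapic_page, &max_irr);
	}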

>> The weird promise is
>> that the PINV interrupt is the _only_ trigger for posted interrupts.
> 
> Ah, I misunderstood the original "only".  I suspect the primary reason is that
> it would cost uops to do the snoop thing and would be inefficient in practice.

Yes, I agree.  But again, the spec seems to be unnecessarily restrictive.

>>> This is the part that is likely impossible to
>>> solve without shadowing the PID (which, for the record, I have zero desire to do).
>> Neither do I.:)   But technically the SDM doesn't promise reading the whole
>> 256 bits at the same time.
>
> Hrm, the wording is poor, but my interpretation of this blurb is that the CPU
> somehow has a death grip on the PID cache line while it's reading and clearing
> the PIR.
> 
>    5. The logical processor performs a logical-OR of PIR into VIRR and clears PIR.
>       No other agent can read or write a PIR bit (or group of bits) between the
>       time it is read (to determine what to OR into VIRR) and when it is cleared.

Yeah, that's the part I interpreted as other processors possibly being 
able to see a partially updated version.  Of course in practice the 
processor will be doing everything atomically, but the more restrictive 
reading of the spec all but precludes a software implementation.

Paolo
Sean Christopherson Nov. 25, 2020, 6:32 p.m. UTC | #18
-Idan to stop getting bounces.

On Wed, Nov 25, 2020, Paolo Bonzini wrote:
> On 25/11/20 02:14, Sean Christopherson wrote:
> > > The flag
> > > would not have to live past vmx_vcpu_run even, the vIRR[PINV] bit would be
> > > the primary marker that a nested posted interrupt is pending.
> > 
> > 	while (READ_ONCE(vmx->nested.pi_pending) && PID.ON) {
> > 		vmx->nested.pi_pending = false;
> > 		vIRR.PINV = 1;
> > 	}
> > 
> > would incorrectly set vIRR.PINV in the case where hardware handled the PI, and
> that could result in L1 seeing the interrupt if a nested exit occurred before KVM
> > processed vIRR.PINV for L2.  Note, without PID.ON, the behavior would be really
> > bad as KVM would set vIRR.PINV *every* time hardware handled the PINV.
> 
> It doesn't have to be a while loop, since by the time we get here vcpu->mode
> is not IN_GUEST_MODE anymore.

Hrm, bad loop logic on my part.  I'm pretty sure the exiting vCPU needs to wait
for all senders to finish their sequence, otherwise pi_pending could be left
set, but spinning on pi_pending is wrong.  Your generation counter thing may
also work, but that made my brain hurt too much to work through the logic. :-)

Something like this?

static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
						int vector)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (is_guest_mode(vcpu) &&
	    vector == vmx->nested.posted_intr_nv) {
		/*
		 * Let a concurrently exiting target vCPU see this sender:
		 * it must wait for pi_sending_count to reach zero before
		 * completing nested posted interrupts, so that pi_pending
		 * set by an in-flight sender is not missed.
		 */
		vmx->nested.pi_sending_count++;
		smp_wmb();
		if (kvm_vcpu_trigger_posted_interrupt(vcpu, true)) {
			vmx->nested.pi_pending = true;
		} else {
			<set PINV in L1 vIRR>
			kvm_make_request(KVM_REQ_EVENT, vcpu);
			kvm_vcpu_kick(vcpu);
		}
		smp_wmb();
		vmx->nested.pi_sending_count--;
		return 0;
	}
	return -1;
}

static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
	...

	/* The actual VMENTER/EXIT is in the .noinstr.text section. */
	vmx_vcpu_enter_exit(vcpu, vmx);

	...

	if (is_guest_mode(vcpu)) {
		/* Wait for in-flight senders before draining the PID. */
		while (READ_ONCE(vmx->nested.pi_sending_count));

		vmx_complete_nested_posted_interrupt(vcpu);
	}

	...
}

> To avoid the double PINV delivery, we could process the PID as in
> vmx_complete_nested_posted_interrupt in this particular case---but
> vmx_complete_nested_posted_interrupt would be moved from vmentry to vmexit,
> and the common case would use vIRR.PINV instead.  There would still be double
> processing, but it would solve the migration problem in a relatively elegant
> manner.

I like this idea, a lot.  I'm a-ok with KVM processing more PIRs than the
SDM may or may not technically allow.

Jim, any objections?
Paolo Bonzini Nov. 26, 2020, 1:13 p.m. UTC | #19
On 25/11/20 19:32, Sean Christopherson wrote:
> I'm pretty sure the exiting vCPU needs to wait
> for all senders to finish their sequence, otherwise pi_pending could be left
> set, but spinning on pi_pending is wrong.

What if you set it before?

static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
						int vector)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (is_guest_mode(vcpu) &&
	    vector == vmx->nested.posted_intr_nv) {
		/*
		 * Set pi_pending after ON.
		 */
		smp_store_release(&vmx->nested.pi_pending, true);
		if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) {
			/*
			 * The guest was not running, let's try again
			 * on the next vmentry.
			 */
			<set PINV in L1 vIRR>
			kvm_make_request(KVM_REQ_EVENT, vcpu);
			kvm_vcpu_kick(vcpu);
			vmx->nested.pi_pending = false;
		}
		return 0;
	}
	return -1;
}

On top of this:

- kvm_x86_ops.hwapic_irr_update can be deleted.  It is already done 
unconditionally by vmx_sync_pir_to_irr before every vmentry.  This gives 
more freedom in changing vmx_sync_pir_to_irr and vmx_hwapic_irr_update.

- VCPU entry must check if max_irr == vmx->nested.posted_intr_nv, and if 
so send a POSTED_INTR_NESTED_VECTOR self-IPI.

Combining both (and considering that AMD doesn't do anything interesting 
in vmx_sync_pir_to_irr), I would move the whole call to 
vmx_sync_pir_to_irr from x86.c to vmx/vmx.c, so that we know that 
vmx_hwapic_irr_update is called with interrupts disabled and right 
before vmentry:

  static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
  {
	...
-	vmx_hwapic_irr_update(vcpu, max_irr);
         return max_irr;
  }

-static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
+static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu)
  {
+	int max_irr;
+
+	WARN_ON(!irqs_disabled());
+	max_irr = vmx_sync_pir_to_irr(vcpu);
         if (!is_guest_mode(vcpu))
                 vmx_set_rvi(max_irr);
+	else if (max_irr == vmx->nested.posted_intr_nv) {
+		...
+	}
  }

and in vmx_vcpu_run:

+	if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
+		vmx_hwapic_irr_update(vcpu);


If you agree, feel free to send this (without the else of course) as a 
separate cleanup patch immediately.

Paolo
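
The self-IPI check from the second point above could look roughly like this
sketch; apic->send_IPI_self() and POSTED_INTR_NESTED_VECTOR exist, but the
helper name and its placement are assumptions:

	/* Hypothetical vmentry-time check: if the highest pending vector is
	 * L1's nested notification vector, poke ourselves so the CPU
	 * processes the vmcs02 PID atomically at VM entry. */
	static void vmx_nested_pi_self_ipi(struct kvm_vcpu *vcpu, int max_irr)
	{
		struct vcpu_vmx *vmx = to_vmx(vcpu);

		if (is_guest_mode(vcpu) && max_irr == vmx->nested.posted_intr_nv)
			apic->send_IPI_self(POSTED_INTR_NESTED_VECTOR);
	}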
Sean Christopherson Nov. 30, 2020, 7:14 p.m. UTC | #20
On Thu, Nov 26, 2020, Paolo Bonzini wrote:
> On 25/11/20 19:32, Sean Christopherson wrote:
> > I'm pretty sure the exiting vCPU needs to wait
> > for all senders to finish their sequence, otherwise pi_pending could be left
> > set, but spinning on pi_pending is wrong.
> 
> What if you set it before?

That doesn't help.  nested.pi_pending will be left set, with a valid vIRQ in the
PID, after vmx_vcpu_run() if kvm_vcpu_trigger_posted_interrupt() succeeds but
the PINV is delivered in the host.

Side topic, for the "wait" sequence to work, vmx_vcpu_run() would need to do
kvm_vcpu_exiting_guest_mode() prior to waiting for senders to complete their
sequence.
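
Roughly, the exit path ordering would be (kvm_vcpu_exiting_guest_mode() is
real; the placement and the waiting loop are assumptions):

	/* Publish that we are leaving guest mode first, so new senders
	 * fall back to KVM_REQ_EVENT instead of joining the set we are
	 * about to wait on, then drain the in-flight senders. */
	kvm_vcpu_exiting_guest_mode(vcpu);
	while (READ_ONCE(vmx->nested.pi_sending_count))
		cpu_relax();
	vmx_complete_nested_posted_interrupt(vcpu);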

> 
> static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
> 						int vector)
> {
> 	struct vcpu_vmx *vmx = to_vmx(vcpu);
> 
> 	if (is_guest_mode(vcpu) &&
> 	    vector == vmx->nested.posted_intr_nv) {
> 		/*
> 		 * Set pi_pending after ON.
> 		 */
> 		smp_store_release(&vmx->nested.pi_pending, true);
> 		if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) {
> 			/*
> 			 * The guest was not running, let's try again
> 			 * on the next vmentry.
> 			 */
> 			<set PINV in L1 vIRR>
> 			kvm_make_request(KVM_REQ_EVENT, vcpu);
> 			kvm_vcpu_kick(vcpu);
> 			vmx->nested.pi_pending = false;
> 		}
> 		return 0;
> 	}
> 	return -1;
> }
> 
> On top of this:
> 
> - kvm_x86_ops.hwapic_irr_update can be deleted.  It is already done
> unconditionally by vmx_sync_pir_to_irr before every vmentry.  This gives
> more freedom in changing vmx_sync_pir_to_irr and vmx_hwapic_irr_update.

And would lower the barrier of entry for understanding this code :-)

> - VCPU entry must check if max_irr == vmx->nested.posted_intr_nv, and if so
> send a POSTED_INTR_NESTED_VECTOR self-IPI.

Hmm, there's also this snippet in vmx_sync_pir_to_irr() that needs to be dealt
with.  If the new max_irr in this case is the nested PI vector, KVM will bail
from the run loop instead of continuing on. 

	/*
	 * If we are running L2 and L1 has a new pending interrupt
	 * which can be injected, we should re-evaluate
	 * what should be done with this new L1 interrupt.
	 * If L1 intercepts external-interrupts, we should
	 * exit from L2 to L1. Otherwise, interrupt should be
	 * delivered directly to L2.
	 */
	if (is_guest_mode(vcpu) && max_irr_updated) {
		if (nested_exit_on_intr(vcpu))
			kvm_vcpu_exiting_guest_mode(vcpu);
		else
			kvm_make_request(KVM_REQ_EVENT, vcpu);
	}

> Combining both (and considering that AMD doesn't do anything interesting in
> vmx_sync_pir_to_irr), I would move the whole call to vmx_sync_pir_to_irr
> from x86.c to vmx/vmx.c, so that we know that vmx_hwapic_irr_update is
> called with interrupts disabled and right before vmentry:
> 
>  static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
>  {
> 	...
> -	vmx_hwapic_irr_update(vcpu, max_irr);
>         return max_irr;
>  }
> 
> -static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
> +static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu)

I would also vote to rename this helper; not sure what to call it, but for me
the current name doesn't help understand its purpose.

>  {
> +	int max_irr;
> +
> +	WARN_ON(!irqs_disabled());
> +	max_irr = vmx_sync_pir_to_irr(vcpu);
>         if (!is_guest_mode(vcpu))
>                 vmx_set_rvi(max_irr);
> +	else if (max_irr == vmx->nested.posted_intr_nv) {
> +		...
> +	}
>  }
> 
> and in vmx_vcpu_run:
> 
> +	if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
> +		vmx_hwapic_irr_update(vcpu);

And also drop the direct call to vmx_sync_pir_to_irr() in the fastpath exit.

> If you agree, feel free to send this (without the else of course) as a
> separate cleanup patch immediately.

Without what "else"?
Paolo Bonzini Nov. 30, 2020, 7:36 p.m. UTC | #21
On 30/11/20 20:14, Sean Christopherson wrote:
>> +	WARN_ON(!irqs_disabled());
>> +	max_irr = vmx_sync_pir_to_irr(vcpu);
>>          if (!is_guest_mode(vcpu))
>>                  vmx_set_rvi(max_irr);
>> +	else if (max_irr == vmx->nested.posted_intr_nv) {
>> +		...
>> +	}
>>   }
>>
>> and in vmx_vcpu_run:
>>
>> +	if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
>> +		vmx_hwapic_irr_update(vcpu);
> And also drop the direct call to vmx_sync_pir_to_irr() in the fastpath exit.
> 
>> If you agree, feel free to send this (without the else of course) as a
>> separate cleanup patch immediately.
> Without what "else"?

This one:

+	else if (max_irr == vmx->nested.posted_intr_nv) {
+		...

Paolo
Jim Mattson Dec. 3, 2020, 9:45 p.m. UTC | #22
On Tue, Nov 24, 2020 at 3:09 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> On 24/11/20 01:10, Oliver Upton wrote:
> >> but you also have to do the same *in the PINV handler*
> >> sysvec_kvm_posted_intr_nested_ipi too, to handle the case where the
> >> L2->L0 vmexit races against sending the IPI.
> > Indeed, there is a race but are we assured that the target vCPU thread
> > is scheduled on the target CPU when that IPI arrives?
>
> This would only happen if the source vCPU saw vcpu->mode ==
> IN_GUEST_MODE for the target vCPU.  Thus there are three cases:
>
> 1) the vCPU is in non-root mode.  This is easy. :)
>
> 2) the vCPU hasn't entered the VM yet.  Then posted interrupt IPIs are
> delayed after guest entry and ensured to result in virtual interrupt
> delivery, just like case 1.
>
> 3) the vCPU has left the VM but it hasn't reached
>
>          vcpu->mode = OUTSIDE_GUEST_MODE;
>          smp_wmb();
>
> yet.  Then the interrupt will be delivered right after that moment, at:
>
>          kvm_before_interrupt(vcpu);
>          local_irq_enable();
>          ++vcpu->stat.exits;
>          local_irq_disable();
>          kvm_after_interrupt(vcpu);
>
> Anything else will cause kvm_vcpu_trigger_posted_interrupt(vcpu, true)
> to return false instead of sending an IPI.

This logic only applies to when the IPI is *sent*. AFAIK, there is no
bound on the amount of time that can elapse before the IPI is
*received*. (In fact, virtualization depends on this.)
Jim Mattson Dec. 3, 2020, 10:07 p.m. UTC | #23
On Tue, Nov 24, 2020 at 3:39 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> On 24/11/20 02:55, Sean Christopherson wrote:
> >>> I believe there is a 1-to-many relationship here, which is why I said
> >>> each CPU would need to maintain a linked list of possible vCPUs to
> >>> iterate and find the intended recipient.
> >
> > Ya, the concern is that it's theoretically possible for the PINV to arrive in L0
> > after a different vCPU has been loaded (or even multiple different vCPUs).
> > E.g. if the sending pCPU is hit with an NMI after checking vcpu->mode, and the
> > NMI runs for some absurd amount of time.  If that happens, the PINV handler
> > won't know which vCPU(s) should get an IRQ injected into L1 without additional
> > tracking.  KVM would need to set something like nested.pi_pending before doing
> > kvm_vcpu_trigger_posted_interrupt(), i.e. nothing really changes, it just gets
> > more complex.
>
> Ah, gotcha.  What if IN_GUEST_MODE/OUTSIDE_GUEST_MODE was replaced by a
> generation count?  Then you reread vcpu->mode after sending the IPI, and
> retry if it does not match.

Meanwhile, the IPI that you previously sent may have triggered posted
interrupt processing for the wrong vCPU.

Consider an overcommitted scenario, where each pCPU is running two
vCPUs. L1 initializes all of the L2 PIDs so that PIR[x] and PID.ON are
set. Then, it sends the VMCS12 NV IPI to one specific L2 vCPU. When L0
processes the VM-exit for sending the IPI, the target vCPU is running
IN_GUEST_MODE, so we send the vmcs02 NV to the corresponding pCPU.
But, by the time the IPI is received on the pCPU, a different L2 vCPU
is running. Oops. It seems that we have to pin the target vCPU to the
pCPU until the IPI arrives. But since it's the vmcs02 NV, we have no
way of knowing whether or not it has arrived.

If the requisite IPI delivery delays seem too absurd to contemplate,
consider pushing L0 down into L1.

> To guarantee atomicity, the OUTSIDE_GUEST_MODE, IN_GUEST_MODE,
> EXITING_GUEST_MODE and READING_SHADOW_PAGE_TABLES values would remain in the
> bottom 2 bits.  That is, the vcpu->mode accesses like
>
>         vcpu->mode = IN_GUEST_MODE;
>
>         vcpu->mode = OUTSIDE_GUEST_MODE;
>
>         smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
>
>         smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
>
>         return cmpxchg(&vcpu->mode, IN_GUEST_MODE, EXITING_GUEST_MODE);
>
> becoming
>
>         enum {
>                 OUTSIDE_GUEST_MODE,
>                 IN_GUEST_MODE,
>                 EXITING_GUEST_MODE,
>                 READING_SHADOW_PAGE_TABLES,
>                 GUEST_MODE_MASK = 3,
>         };
>
>         vcpu->mode = (vcpu->mode | GUEST_MODE_MASK) + 1 + IN_GUEST_MODE;
>
>         vcpu->mode &= ~GUEST_MODE_MASK;
>
>         smp_store_mb(vcpu->mode, vcpu->mode|READING_SHADOW_PAGE_TABLES);
>
>         smp_store_release(&vcpu->mode, vcpu->mode & ~GUEST_MODE_MASK);
>
>         int x = READ_ONCE(vcpu->mode);
>         do {
>                 if ((x & GUEST_MODE_MASK) != IN_GUEST_MODE)
>                         return x & GUEST_MODE_MASK;
>         } while (!try_cmpxchg(&vcpu->mode, &x,
>                               x ^ IN_GUEST_MODE ^ EXITING_GUEST_MODE))
>         return IN_GUEST_MODE;
>
> You could still get spurious posted interrupt IPIs, but the IPI handler
> need not do anything at all and that is much better.
>
> > if we're ok with KVM
> > processing virtual interrupts that technically shouldn't happen, yet.  E.g. if
> > the L0 PINV handler consumes vIRR bits that were set after the last PI from L1.
>
> I actually find it curious that the spec promises posted interrupt
> processing to be triggered only by the arrival of the posted interrupt
> IPI.  Why couldn't the processor in principle snoop for the address of
> the ON bit instead, similar to an MWAIT?
>
> But even without that, I don't think the spec promises that kind of
> strict ordering with respect to what goes on in the source.  Even though
> posted interrupt processing is atomic with the acknowledgement of the
> posted interrupt IPI, the spec only promises that the PINV triggers an
> _eventual_ scan of PID.PIR when the interrupt controller delivers an
> unmasked external interrupt to the destination CPU.  You can still have
> something like
>
>         set PID.PIR[100]
>         set PID.ON
>                                         processor starts executing a
>                                          very slow instruction...
>         send PINV
>         set PID.PIR[200]
>                                         acknowledge PINV
>
> and then vector 200 would be delivered before vector 100.  Of course
> with nested PI the effect would be amplified, but it's possible even on
> bare metal.
>
> Paolo
>
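For reference, the sender side of the generation-count scheme quoted above
would go roughly as below (send_posted_interrupt_ipi() is a made-up stand-in
for the real APIC write; and per the objection above, a matching generation
still does not bound when the PINV is actually received):

	static bool pi_send_with_generation(struct kvm_vcpu *vcpu)
	{
		int old = READ_ONCE(vcpu->mode);

		while ((old & GUEST_MODE_MASK) == IN_GUEST_MODE) {
			send_posted_interrupt_ipi(vcpu->cpu);
			/* Same generation: the target did not exit while the
			 * IPI was in flight on the sender side. */
			if (READ_ONCE(vcpu->mode) == old)
				return true;
			old = READ_ONCE(vcpu->mode);
		}
		return false;	/* not in guest mode: KVM_REQ_EVENT + kick */
	}
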
diff mbox

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4b8caff83dc7..71c240bbb685 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -994,6 +994,7 @@  struct kvm_x86_ops {
 	void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
 	void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
 	void (*complete_nested_posted_interrupt)(struct kvm_vcpu *vcpu);
+	bool (*cpu_has_nested_posted_interrupt)(struct kvm_vcpu *vcpu);
 	int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*get_tdp_level)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 183f7aec29ca..a559c26d7bff 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4458,6 +4458,11 @@  static void svm_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 {
 }
 
+static bool svm_cpu_has_nested_posted_interrupt(struct kvm_vcpu *vcpu)
+{
+	return false;
+}
+
 /* Note: Currently only used by Hyper-V. */
 static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
 {
@@ -5593,6 +5598,8 @@  static int enable_smi_window(struct kvm_vcpu *vcpu)
 	.sync_pir_to_irr = svm_sync_pir_to_irr,
 	.complete_nested_posted_interrupt =
 		svm_complete_nested_posted_interrupt,
+	.cpu_has_nested_posted_interrupt =
+		svm_cpu_has_nested_posted_interrupt,
 	.apicv_post_state_restore = avic_post_state_restore,
 
 	.set_tss_addr = svm_set_tss_addr,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c2d012d9d16d..8b414dd5e07c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5141,6 +5141,17 @@  static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 	}
 }
 
+static bool vmx_cpu_has_nested_posted_interrupt(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	return (vcpu->arch.apicv_active &&
+		is_guest_mode(vcpu) &&
+		vmx->nested.pi_pending &&
+		vmx->nested.pi_desc &&
+		pi_test_on(vmx->nested.pi_desc));
+}
+
 /*
  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
  * will not change in the lifetime of the guest.
@@ -12142,6 +12153,8 @@  static int enable_smi_window(struct kvm_vcpu *vcpu)
 	.deliver_posted_interrupt = vmx_deliver_posted_interrupt,
 	.complete_nested_posted_interrupt =
 		vmx_complete_nested_posted_interrupt,
+	.cpu_has_nested_posted_interrupt =
+		vmx_cpu_has_nested_posted_interrupt,
 
 	.set_tss_addr = vmx_set_tss_addr,
 	.get_tdp_level = get_ept_level,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fa088951afc9..a840f2c9bd66 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8542,7 +8542,8 @@  static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 		return true;
 
 	if (kvm_arch_interrupt_allowed(vcpu) &&
-	    kvm_cpu_has_interrupt(vcpu))
+	    (kvm_cpu_has_interrupt(vcpu) ||
+	     kvm_x86_ops->cpu_has_nested_posted_interrupt(vcpu)))
 		return true;
 
 	if (kvm_hv_has_stimer_pending(vcpu))